Posted by y23335793 on 2015-04-20 12:41:11

Huang Cong: Python Website Scraping (multi-threaded scraping, the WDPYSPIDER class, pycurl)

Python

import urllib

urlItem = urllib.urlopen("http://www.baidu.com")
htmSource = urlItem.read()
urlItem.close()
print htmSource
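
For reference, the same fetch on Python 3 would go through urllib.request; the urllib.urlopen call above exists only on Python 2:

Python

# Python 3 equivalent of the snippet above
from urllib.request import urlopen

with urlopen("http://www.baidu.com") as url_item:
    htm_source = url_item.read()
print(htm_source)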



pycurl
  http://pycurl.sourceforge.net/download/
  http://pycurl.sourceforge.net/doc/curlobject.html
Python

import pycurl
import StringIO

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.HTTPHEADER, ["Accept:"])
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform()
print b.getvalue()
# INFO_FILETIME is only meaningful if pycurl.OPT_FILETIME was set to 1 before perform()
print c.getinfo(pycurl.INFO_FILETIME)



curl_easy_setopt

Tells libcurl how to behave. A sketch combining several of the HTTP options follows this list.
CURLOPT_WRITEFUNCTION: write (download) callback; libcurl hands each received chunk to this function, and a single callback never receives more than CURL_MAX_WRITE_SIZE bytes (defined in curl.h)
CURLOPT_WRITEDATA: write the response directly to a file, e.g. c.setopt(pycurl.WRITEDATA, 'E:\WebSite\py\1.txt'); note that this form does not work on Windows (in pycurl, passing an open file object is the safer choice)
CURLOPT_READFUNCTION: read (upload) callback
CURLOPT_SEEKFUNCTION: seek callback for repositioning the input data; int function(void *instream, curl_off_t offset, int origin) with SEEK_SET, SEEK_CUR and SEEK_END, returning CURL_SEEKFUNC_OK, CURL_SEEKFUNC_FAIL or CURL_SEEKFUNC_CANTSEEK (0, 1, 2)
CURLOPT_OPENSOCKETFUNCTION:
CURLOPT_HEADERFUNCTION: receives header data only; size_t function(void *ptr, size_t size, size_t nmemb, void *userdata)
CURLOPT_DEBUGFUNCTION: int curl_debug_callback(CURL *, curl_infotype, char *, size_t, void *)
CURLOPT_VERBOSE: set to 1 to make libcurl print verbose diagnostic information
CURLOPT_HEADER: set to 1 to include the response headers in the returned body
CURLOPT_NOSIGNAL: set to 1 to stop libcurl from using signals; needed when timeouts are used in multi-threaded programs
CURLOPT_FOLLOWLOCATION: set to 1 to tell libcurl to follow any Location: redirect
CURLOPT_MAXREDIRS: limit on the number of redirects to follow; -1 means unlimited (the default)
Upload/POST related options:
CURLOPT_PUT:
CURLOPT_POST:
CURLOPT_POSTREDIR:
CURLOPT_POSTFIELDS:
CURLOPT_POSTFIELDSIZE:
CURLOPT_POSTFIELDSIZE_LARGE:
CURLOPT_COPYPOSTFIELDS:
CURLOPT_HTTPPOST:
CURLOPT_UPLOAD:
CURLOPT_AUTOREFERER: let libcurl set the Referer header automatically when it follows a redirect
CURLOPT_REFERER: forge the Referer (source) URL
CURLOPT_USERAGENT: custom User-Agent string
CURLOPT_HTTPHEADER: custom request headers
CURLOPT_COOKIE: "name1=content1; name2=content2;"
CURLOPT_COOKIEFILE:
CURLOPT_COOKIEJAR:
CURLOPT_COOKIESESSION: by default libcurl always loads and stores all cookies; set to 1 to start a new cookie session
CURLOPT_COOKIELIST
CURLOPT_HTTPGET
CURLOPT_HTTP_VERSION: CURL_HTTP_VERSION_NONE, CURL_HTTP_VERSION_1_0, CURL_HTTP_VERSION_1_1
CURLOPT_IGNORE_CONTENT_LENGTH: ignore the Content-Length header; works around servers such as Apache 1.x
CURLOPT_HTTP_TRANSFER_DECODING: tell libcurl whether to decode the transfer encoding (0 or 1)
CURLOPT_HTTP200ALIASES: aliases for the HTTP 200 status line, for servers whose 200 response is non-standard
CURLOPT_ENCODING: accepted content encodings, same as the Accept-Encoding header ('', 'gzip', ...)
CURLOPT_UNRESTRICTED_AUTH: set to 1 to keep sending authentication (user + password) when following redirects to other hosts
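
A minimal pycurl sketch that ties several of the options above together; the URL and the form field are placeholders for a local test endpoint:

Python

import pycurl
import StringIO
import urllib

body = StringIO.StringIO()
head = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://localhost/post_test.php")   # placeholder URL
c.setopt(pycurl.HTTPHEADER, ['Accept: text/html'])
c.setopt(pycurl.ENCODING, 'gzip')                         # same as Accept-Encoding
c.setopt(pycurl.POSTFIELDS, urllib.urlencode({'q': 'whiledo'}))  # POSTFIELDS expects an encoded string
c.setopt(pycurl.WRITEFUNCTION, body.write)                # response body goes here
c.setopt(pycurl.HEADERFUNCTION, head.write)               # response headers only
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform()
print head.getvalue()
print body.getvalue()
c.close()
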
NETWORK OPTIONS
CURLOPT_URL: http://xxxx, ftp://xxxx
CURLOPT_PROXY: HTTP proxy to use; host name or IP address
CURLOPT_PROXYPORT: proxy port; can also be appended to the proxy address as ":port", e.g. :8080
CURLOPT_PROXYTYPE: proxy type: CURLPROXY_HTTP (default), CURLPROXY_HTTP_1_0, CURLPROXY_SOCKS4, CURLPROXY_SOCKS5, CURLPROXY_SOCKS4A, CURLPROXY_SOCKS5_HOSTNAME
CURLOPT_NOPROXY: domains that should bypass the proxy
CURLOPT_HTTPPROXYTUNNEL:
CURLOPT_BUFFERSIZE: libcurl's receive buffer size (in bytes)
(Authentication)
CURLOPT_NETRC: controls how ~/.netrc credentials are used; CURL_NETRC_OPTIONAL uses the ~/.netrc file, CURL_NETRC_IGNORED (default) ignores it, CURL_NETRC_REQUIRED requires the file and tells the library to ignore credentials given in the URL
CURLOPT_NETRC_FILE: path of the ~/.netrc file to use
CURLOPT_USERNAME:
CURLOPT_USERPWD:
CURLOPT_PASSWORD:
CURLOPT_PROXYUSERNAME:
CURLOPT_PROXYUSERPWD:
CURLOPT_HTTPAUTH:
CURLOPT_PROXYAUTH:

Bitmask values for CURLOPT_HTTPAUTH / CURLOPT_PROXYAUTH (a proxy and authentication sketch follows this list):
CURLAUTH_BASIC: HTTP Basic authentication
CURLAUTH_DIGEST: HTTP Digest authentication
CURLAUTH_DIGEST_IE:
CURLAUTH_GSSNEGOTIATE: Kerberos 5 authentication; requires a GSS-API build of libcurl
CURLAUTH_NTLM: NTLM authentication
CURLAUTH_ANY: enable all methods and let libcurl pick the one it considers suitable and secure
CURLAUTH_ANYSAFE: enable all methods except Basic
CURLAUTH_ONLY: force the request to use only the selected authentication method
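
A hedged sketch of the proxy and authentication options; the URL, proxy address and credentials are placeholders:

Python

import pycurl
import StringIO

buf = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://localhost/protected/")   # placeholder URL
c.setopt(pycurl.PROXY, "127.0.0.1")                   # placeholder proxy host
c.setopt(pycurl.PROXYPORT, 8080)
c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_HTTP)
c.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_BASIC)       # or HTTPAUTH_ANY
c.setopt(pycurl.USERPWD, "user:password")              # placeholder credentials
c.setopt(pycurl.WRITEFUNCTION, buf.write)
c.perform()
print buf.getvalue()
c.close()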

getinfo

Retrieves information about the completed transfer (see the sketch after this list):
CURLINFO_RESPONSE_CODE: the last received HTTP or FTP status code, e.g. 200, 404, 403, 505; for the proxy's CONNECT response see CURLINFO_HTTP_CONNECTCODE
CURLINFO_EFFECTIVE_URL: the last effective URL that was used
CURLINFO_HTTP_CONNECTCODE: the last received proxy CONNECT response code
CURLINFO_FILETIME:
CURLINFO_TOTAL_TIME:
CURLINFO_CONNECT_TIME:
CURLINFO_NUM_CONNECTS: number of connections libcurl had to make
CURLINFO_CONTENT_TYPE: e.g. text/html
CURLINFO_REQUEST_SIZE:
CURLINFO_HEADER_SIZE:
CURLINFO_SIZE_DOWNLOAD: total number of bytes downloaded
CURLINFO_SIZE_UPLOAD:
CURLINFO_HTTPAUTH_AVAIL: bitmask of the authentication methods the server offered
CURLINFO_PROXYAUTH_AVAIL: bitmask of the authentication methods the proxy offered
CURLINFO_COOKIELIST:
In pycurl some of these use the INFO_ prefix, e.g. INFO_COOKIELIST.
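
A small sketch of getinfo after perform(); the URL is the same example site as above:

Python

import pycurl
import StringIO

buf = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.WRITEFUNCTION, buf.write)
c.perform()
print c.getinfo(pycurl.RESPONSE_CODE)   # e.g. 200
print c.getinfo(pycurl.EFFECTIVE_URL)   # final URL after redirects
print c.getinfo(pycurl.CONTENT_TYPE)    # e.g. text/html
print c.getinfo(pycurl.TOTAL_TIME)      # total transfer time in seconds
print c.getinfo(pycurl.SIZE_DOWNLOAD)   # bytes downloaded
c.close()
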
A rough scraping example that shares a single curl object

Python

import pycurl
import StringIO
import string
import random

class spider:
    def __init__(self, addHeader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            # note: the header name must be 'User-Agent', not 'USER_AGENT'
            #,'User-Agent:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + addHeader
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        self.curl.setopt(pycurl.REFERER, 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + self.rand_str())
        #self.curl.setopt(pycurl.AUTOREFERER, 1)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)

    def __del__(self):
        pass

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, fp.write)
        self.curl.perform()
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, sio.write)
        self.curl.perform()
        reval = sio.getvalue()
        sio.close()
        return reval

if __name__ == "__main__":
    get = spider(['User-Agent:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'])
    print get.html("http://localhost/spider_for_test.php")
    print get.tofile("http://localhost/spider_for_test.php", r'E:\WebSite\wwwroot\test.txt')



A multi-threaded scraping example

Python

import pycurl
import threading
import StringIO
import string
import random

class spider:
    def __init__(self, referer='', httpheader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'User-Agent:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + httpheader
        self.referer = referer

    def __del__(self):
        pass

    def fetch(self, url, stream):
        # a new Curl handle per call keeps fetch() usable from several threads at once
        curl = pycurl.Curl()
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.perform()
        curl.close()

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.fetch(url, fp)
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.fetch(url, sio)
        reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    import time, datetime
    dstart = datetime.datetime.now()
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    thread_pool = []
    for i in range(10):
        url = "http://localhost/test.php?n=" + str(i)
        th = threading.Thread(target=gethtml, args=(url, get))
        thread_pool.append(th)
    for i in range(10):
        thread_pool[i].start()
    for i in range(10):
        thread_pool[i].join()
    dend = datetime.datetime.now()
    print "Time span:", dend - dstart



The WDPYSPIDER class (supports multi-threading, proxies, login authentication, POST)

Python

#coding:utf-8
import pycurl
import urllib
import threading
import StringIO
import string
import random
class spider:
    '''WDPYSPIDER (Whiledo Python Spider Class) scraping class

    @author HzqGhost admin@whiledo.com QQ:313143468
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q='+get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)'''
    def __init__(self):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'User-Agent:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] # HTTP request headers
        self.referer = '' # forged Referer
        self.connecttimeout = 60 # connect timeout (seconds)
        self.timeout = 300 # read timeout (seconds)
        self.backheader = 0 # include the server's response headers in the output (mainly for testing)
        self.cookesfile = "./cookes.dat" # cookie file, read and written automatically
        self.proxyuse = False # whether to use a proxy server
        self.proxyip = [] # proxy list; one entry is picked at random
        self.proxynodomain = ['localhost','127.0.0.1'] # domains that bypass the proxy
        self.http200alias = [] # aliases for non-standard HTTP 200 status lines
        self.error = 'WDPYERROR' # error marker returned for non-200 statuses

    def __del__(self):
        pass

    def fetch(self, url, stream, post={}):
        '''
        --url
        --stream StringIO or fp
        --post {'username':'hzq','password':'blog'}'''
        curl = pycurl.Curl()
        curl.setopt(pycurl.CONNECTTIMEOUT, self.connecttimeout)
        curl.setopt(pycurl.TIMEOUT, self.timeout)
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        curl.setopt(pycurl.HTTP200ALIASES, self.http200alias)
        curl.setopt(pycurl.HEADER, self.backheader)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.COOKIEJAR, self.cookesfile)
        curl.setopt(pycurl.COOKIEFILE, self.cookesfile)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.setopt(pycurl.URL, url)
        if self.proxyuse:
            proxyip = random.choice(self.proxyip)   # pick one proxy from the list
            curl.setopt(pycurl.PROXY, proxyip)
            #curl.setopt(pycurl.PROXYNO, self.proxynodomain) # needs pycurl built against libcurl 7.19.4+
        if len(post) > 0:
            curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(post))  # POSTFIELDS expects an encoded string
        status = ''
        try:
            curl.perform()
            status = curl.getinfo(pycurl.RESPONSE_CODE)
        except:
            status = curl.errstr()
        finally:
            curl.close()
            status = str(status)
            if status != '200':
                status = self.error
            return status

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename, post={}):
        fp = open(filename, 'wb')
        self.fetch(url, fp, post)
        fp.close()
        return True

    def html(self, url, post={}):
        sio = StringIO.StringIO()
        reval = self.fetch(url, sio, post)
        if reval == '200':
            reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)
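
Since the class also advertises POST and login support, here is a hedged usage sketch; the login URL, field names and credentials are placeholders, and the session cookie is carried between calls through cookes.dat:

Python

get = spider()
# hypothetical login form; the POST fields are urlencoded inside fetch()
result = get.html("http://localhost/login.php", post={'username': 'hzq', 'password': 'blog'})
if result != get.error:
    # the stored cookie is replayed automatically on the next request
    print get.html("http://localhost/member/index.php")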

