Python写的简易采集爬虫(蜘蛛)

依然饭跑跑 发表于 2015-4-23 07:39:40

#!/usr/bin/python
#-*-coding:utf-8-*-

# 简易采集爬虫
# 1.采集Yahoo!Answers，parseData函数修改一下，可以采集任何网站
# 2.需要sqlite3或者pysqlite支持
# 3.可以在DreamHost.com空间上面运行
# 4.可以修改User-Agent冒充搜索引擎蜘蛛
# 5.可以设置暂停的时间，控制采集速度
# 6.采集Yahoo会被封IP数小时，所以这个采集用处不大
# Author: Lukin
# Date: 2008-09-25

# 导入采集需要用到的模块
import re, sys, time
import httplib, os.path as osp
from urlparse import urlparse
# SQLite driver. Written as a fallback chain for compatibility with
# DreamHost.com hosting: prefer the bundled sqlite3 module and fall back to
# the separately installed pysqlite2 package when sqlite3 cannot be imported.
try:
    import sqlite3 as sqlite
except ImportError:
    from pysqlite2 import dbapi2 as sqlite

# Pause between two fetches, in seconds.
sleep = 0

# Path of the SQLite database file.
dbname = './database.db'

# Headers sent with every HTTP request.
headers = {
    "Accept": "*/*",
    "Referer": "http://answers.yahoo.com/",
    "User-Agent": "Mozilla/5.0+(compatible;+Googlebot/2.1;++http://www.google.com/bot.html)",
}

# HTTP connection to the server being crawled.
dl = httplib.HTTPConnection('answers.yahoo.com')

# Database connection shared by the functions in this file.
conn = sqlite.connect(osp.abspath(dbname))

def createDatabase():
    """Create the crawler's tables and indexes when they do not exist yet.

    Every statement uses ``IF NOT EXISTS``, so calling this on an already
    initialised database changes nothing.

    The published copy returned early whenever the database file existed.
    The module opens its connection before this function is called, and
    opening the connection can already have created the file, so that guard
    could skip table creation on a brand-new database. It was dropped.

    NOTE(review): the table, column and index names were missing from the
    published SQL. The names below are a reconstruction based on the values
    bound by the queries elsewhere in this file; every query in the file has
    to use the same names.
    """
    c = conn.cursor()
    try:
        # Queue of URLs. state: 0 = new; other values are written by
        # updateOneUrl() with the result code of collect().
        c.execute('''CREATE TABLE IF NOT EXISTS collect(cid INTEGER PRIMARY KEY, curl TEXT, state INTEGER DEFAULT '0', UNIQUE(curl));''')
        c.execute('''CREATE INDEX IF NOT EXISTS collect_idx_state ON collect(state);''')
        # Categories. sortfoot holds the id of the parent category.
        c.execute('''CREATE TABLE IF NOT EXISTS sorts(sortid INTEGER PRIMARY KEY, sortname TEXT, sortpath TEXT, sortfoot INTEGER DEFAULT '0', sortnum INTEGER DEFAULT '0', UNIQUE(sortpath));''')
        c.execute('''CREATE INDEX IF NOT EXISTS sorts_idx_sortname ON sorts(sortname);''')
        c.execute('''CREATE INDEX IF NOT EXISTS sorts_idx_sortfoot ON sorts(sortfoot);''')
        # Articles: one row per collected question.
        c.execute('''CREATE TABLE IF NOT EXISTS article(aid INTEGER PRIMARY KEY, sortid INTEGER DEFAULT '0', hits INTEGER DEFAULT '0', title TEXT, path TEXT, question TEXT, banswer TEXT, oanswer TEXT, UNIQUE(path));''')
        c.execute('''CREATE INDEX IF NOT EXISTS article_idx_sortid ON article(sortid);''')
        conn.commit()
    finally:
        # Release the cursor even when one of the statements fails.
        c.close()

def collect(url="http://answers.yahoo.com/"):
    """Fetch one page, hand it to parseData() and record the outcome.

    Only the path and query string of ``url`` are used for the request; it is
    always sent over the module-level connection ``dl``.

    The page is requested up to three times, pausing 3 seconds between
    attempts, until the server answers with status 200.

    Returns the value of parseData() on success, or 3 when all three
    attempts failed. The same value is stored for ``url`` through
    updateOneUrl().
    """
    print("GET: %s" % url)
    parts = urlparse(url)
    # Index 2 is the path, index 4 the query string.
    path = parts[2] or '/'
    if parts[4] != '':
        path += '?' + parts[4]

    result = 3
    attempts = 3
    for attempt in range(attempts):
        dl.request(method="GET", url=path, headers=headers)
        rs = dl.getresponse()
        # Always drain the body: the connection cannot send another request
        # until the previous response has been read completely.
        body = rs.read()
        if rs.status == 200:
            result = parseData(body, url)
            break
        if attempt < attempts - 1:
            print("3 seconds, try again ...")
            time.sleep(3)
    else:
        # No attempt returned 200; move on to the next URL.
        print("Continue to collect ...")

    # Store the outcome for this URL.
    updateOneUrl(url, result)
    return result

# Parse one fetched page: queue the links it contains and, for question pages,
# store the question together with its answers.
#
# html -- page body as passed in by collect(); url -- the address it came from.
# Returns R: 2 by default, 1 once an article row has been inserted.
#
# NOTE(review): as published this block cannot run. The body has lost its
# indentation and several literals look truncated (link patterns without a tag
# opening, SQL without table/column names, empty sect() markers, missing
# subscripts). The notes below mark those spots; the code itself is untouched.
def parseData(html,url):
global dl,conn; R = 2;
c = conn.cursor()
# Normalise whitespace, then rewrite the URLs inside the page's tags.
format = formatURL(clearBlank(html),url)
# Collect every link of the page.
# NOTE(review): both alternatives start with "]*?" -- the tag opening in front
# of it appears to be missing.
urls = re.findall(r'''(]*?href="([^"]+)"[^>]*?>)|(]*?href='([^']+)'[^>]*?>)''',format,re.I)
# re.findall() returns a list, so this test is always true.
if urls != None :
   i = 0
   # Walk over all links.
   for regs in urls :
         # Take a single URL.
         # NOTE(review): regs is a tuple of groups here and has no strip();
         # presumably an index was lost -- confirm which group was meant.
         sUrl = en2chr(regs.strip())
         # Only directory and question pages are queued.
         if re.search('http(.*?)/(dir|question)/index(.*?)',sUrl,re.I) != None :
            if re.search('http(.*?)/dir/index(.*?)',sUrl,re.I) != None:
               # Directory URLs are normalised to carry link=over.
               if sUrl.find('link=list') == -1 and sUrl.find('link=over') == -1 :
                     sUrl+= '&link=over'
               else:
                     sUrl = sUrl.replace('link=list','link=over')
            # URLs ending in link=mailto are skipped.
            if sUrl[-11:]=='link=mailto' : continue
            try :
               # NOTE(review): table and column names are missing from this SQL.
               c.execute('INSERT INTO ()VALUES(?);',(sUrl,))
               i = i + 1
            except sqlite.IntegrityError :
               # Presumably a URL that is already queued; ignored on purpose.
               pass
   if i>0 : print "Message: %d get a new URL." % (i,)

# Extract the article data; only done for question pages.
if re.search('http(.*)/question/index(.*)',url,re.I) != None :
   sortfoot = 0
   # Create the categories and their parent/child relation on the fly.
   # NOTE(review): the start/end markers passed to sect() are empty strings
   # here and below; the original markers look lost.
   guide= sect(format,'','','((.*?)Home(.*?))')
   # NOTE(review): pattern starts with "]*" -- tag opening appears to be missing.
   aGuide = re.findall(']*href="[^"]*"[^>]*>(.*?)',guide,re.I)
   # re.findall() returns a list, so this test is always true.
   if aGuide != None :
         sortname = ""
         for sortname in aGuide :
            sortname = sortname.strip()
            sortpath = en2path(sortname)
            # Look the category up by its path.
            # NOTE(review): table and column names are missing from this SQL.
            c.execute('SELECT , FROM WHERE =? LIMIT 0,1;',(sortpath,))
            row = c.fetchone();
            # Unknown category: insert it below the previous one.
            if row==None :
               # NOTE(review): table and column names are missing from this SQL.
               c.execute('INSERT INTO (,,)VALUES(?,?,?);',(sortname,sortpath,sortfoot))
               sortfoot = c.lastrowid
            else:
               # NOTE(review): assigns the whole row; presumably one column
               # (the category id) was meant -- confirm.
               sortfoot = row
         # Title.
         title = sect(format,'','')
         # Best answer.
         BestAnswer = sect(format,'(Best Answer(.*?)(.*?))','()')
         # Pages without a best answer are not stored.
         if BestAnswer != None :
            # Path of the article.
            path = en2path(sortname + '-' + title.strip())
            # Question text, plus the optional additional details.
            adddata = sect(format,'','')
            content = sect(format,'((.*?))','()')
            if adddata != None : content += '' + adddata
            # Other answers.
            OtherAnswer = ''
            for regs in re.findall('(.+?)',format):
               # str.find('') returns 0, so as published this condition is
               # never true and the branch below never runs.
               if regs.find('') == -1 and regs.find('') == -1 :
                     a1 = sect(regs,'','')
                     a2 = sect(regs,'','')
                     OtherAnswer+= '' + a1
                     if a2 != None : OtherAnswer+= '' + a2 + ''
                     OtherAnswer+= ''

            # Store only when both title and question were found.
            if title != None and content != None :
               # Write the article to the database.
               try :
                     # NOTE(review): table and column names are missing from this SQL.
                     c.execute('INSERT INTO (,,,,,)VALUES(?,?,?,?,?,?);',(sortfoot,title,path,content,BestAnswer,OtherAnswer))
                     print "Message：%s.html" % (path,)
                     R = 1
               except sqlite.IntegrityError :
                     # Presumably an article that is already stored; ignored.
                     pass
# Commit everything written above and release the cursor.
conn.commit(); c.close()
return R

def getOneUrl():
    """Return one queued URL that still has to be fetched.

    Picks a row whose state is 0 or 3. 3 is the value collect() stores after
    all its attempts failed; 0 is the column default used by
    createDatabase(), so failed URLs are picked up again.

    Returns the URL encoded as UTF-8, or "" when no such row is left.

    NOTE(review): the table and column names were missing from the published
    SQL; ``collect``, ``curl`` and ``state`` are reconstructed names and must
    match the schema actually used by the database.
    """
    c = conn.cursor()
    try:
        c.execute('SELECT curl FROM collect WHERE state IN(0,3) LIMIT 0,1;')
        row = c.fetchone()
    finally:
        c.close()
    if row is None:
        return ""
    # fetchone() returns a row tuple; the URL is its only column.
    return row[0].encode('utf-8')

def updateOneUrl(url,state):
    """Store ``state`` for the queued URL ``url`` and commit.

    Nothing is changed when ``url`` is not in the queue.

    NOTE(review): the table and column names were missing from the published
    SQL; ``collect``, ``curl`` and ``state`` are reconstructed names and must
    match the schema actually used by the database.
    """
    c = conn.cursor()
    try:
        c.execute('UPDATE collect SET state=? WHERE curl=?;', (state, url))
        conn.commit()
    finally:
        c.close()

def clearBlank(html):
    """Strip redundant whitespace from HTML source.

    Carriage returns, line feeds and tabs are removed, every ``&nbsp;``
    entity becomes a plain space, and runs of spaces collapse to one space.

    The published copy tested ``html.find("")``, which is never -1, so its
    loop could not terminate; the ``&nbsp;`` and double-space literals were
    restored.

    Returns the cleaned text, or '' for empty input.
    """
    if len(html) == 0:
        return ''
    html = re.sub('\r|\n|\t', '', html)
    html = html.replace('&nbsp;', ' ')
    # One pass is enough: collapsing spaces cannot create a new entity.
    return re.sub(' {2,}', ' ', html)

def formatURL(html,url):
    """Rewrite the link tags of a page so their URLs become absolute.

    Every tag matched by the link pattern is passed to matchURL() together
    with ``url`` (the address of the page) and replaced by its result.

    Returns the rewritten HTML.

    NOTE(review): the tag opening of the pattern was missing from the
    published copy; ``<a`` is assumed here.
    """
    link_tag = re.compile(
        r'''<a[^>]*?href="[^"]+"[^>]*?>|<a[^>]*?href='[^']+'[^>]*?>''',
        re.I)
    # A callback substitutes each tag exactly once, in place.
    return link_tag.sub(lambda found: matchURL(found.group(0), url), html)

def matchURL(tag,url):
    """Make the URL inside a single tag absolute.

    ``tag`` is the text of one tag; ``url`` is the address of the page the
    tag came from. The value of the tag's ``src``/``href`` attribute (or of a
    ``url(...)`` expression) is resolved against ``url``:

    * values starting with http/https/ftp, ``mailto:`` and ``javascript:``
      are left alone;
    * ``/x`` is prefixed with the scheme and host of ``url``;
    * ``../x``, ``./x`` and ``x`` are resolved against the directory of
      ``url``.

    A ``#fragment`` is dropped from the value, and the rewritten value is
    always wrapped in double quotes.

    Returns the rewritten tag, or ``tag`` unchanged when there is nothing to
    rewrite.
    """
    found = re.findall(
        r'''(.*)(src|href)=(.+?)( |/>|>).*|(.*)url\(([^\)]+)\)''', tag, re.I)
    if not found:
        return tag
    groups = found[0]
    # Group 2 is the src/href value, group 5 the content of url(...).
    if groups[5] == '':
        urlQuote = groups[2]
    else:
        urlQuote = groups[5]
    if len(urlQuote) == 0:
        return tag
    cUrl = re.sub(r'''['"]''', '', urlQuote)

    # urlparse() result: index 0 scheme, 1 host, 2 path.
    urls = urlparse(url)
    scheme = urls[0]
    if scheme != '':
        scheme += '://'
    host = scheme + urls[1]
    if len(host) == 0:
        return tag
    path = osp.dirname(urls[2])
    if path == '/':
        path = ''
    if cUrl.find("#") != -1:
        cUrl = cUrl[:cUrl.find("#")]

    # Decide by the shape of the value.
    if re.search(r'''^(http|https|ftp):(//|\\)(([\w/\\+\-~`@:%])+\.)+([\w/\\.\=\?\+\-~`@':!%#]|(&amp;)|&)+''', cUrl, re.I) is not None:
        # Already absolute.
        return tag
    elif cUrl[:1] == '/':
        # Path relative to the host root.
        cUrl = host + cUrl
    elif cUrl[:3] == '../':
        # Climb one directory for every leading "../".
        while cUrl[:3] == '../':
            cUrl = cUrl[3:]
            if len(path) > 0:
                path = osp.dirname(path)
        if path == '/':
            path = ''
        cUrl = host + path + '/' + cUrl
    elif cUrl[:2] == './':
        # Drop the leading dot, keep the slash.
        cUrl = host + path + cUrl[1:]
    elif cUrl.lower()[:7] == 'mailto:' or cUrl.lower()[:11] == 'javascript:':
        return tag
    else:
        cUrl = host + path + '/' + cUrl
    return tag.replace(urlQuote, '"' + cUrl + '"')

def sect(html,start,end,cls=''):
    """Cut out the part of ``html`` between a start and an end marker.

    When both ``start`` and ``end`` are wrapped in parentheses they are
    treated as regular expressions; otherwise they are plain strings. Both
    kinds are matched case-insensitively, and the markers themselves are not
    part of the result. In plain-string mode ``end`` is searched after the
    end of ``start``.

    ``cls`` is an optional list of clean-up rules that is passed, together
    with the extracted text, to clear().

    Returns the extracted text, or None when ``html`` is empty or a marker
    cannot be found.

    The published copy computed the marker offsets but never used them, so it
    returned the whole input or the whole match; the slicing was restored.
    """
    if not html:
        return None
    is_regex = (start[:1] == chr(40) and start[-1:] == chr(41)
                and end[:1] == chr(40) and end[-1:] == chr(41))
    if is_regex:
        # The text between the markers is the first group after start's own.
        body_group = re.compile(start, re.I).groups + 1
        found = re.search(start + '(.*?)' + end, html, re.I)
        if found is None:
            return None
        R = found.group(body_group)
    else:
        lowered = html.lower()
        begin = lowered.find(start.lower())
        if begin == -1:
            return None
        begin += len(start)
        finish = lowered.find(end.lower(), begin)
        if finish == -1:
            return None
        R = html[begin:finish]
    # Optional clean-up of the extracted text.
    if cls != '':
        R = clear(R, cls)
    return R

def clear(html,regexs):
    """Remove unwanted fragments from ``html``.

    ``regexs`` holds one rule per line. A rule wrapped in parentheses is a
    regular expression, applied case-insensitively with ``.`` also matching
    newlines; any other rule is removed as a literal string. Blank rules are
    skipped.

    The published copy passed the flags as the fourth positional argument of
    re.sub(), which is ``count``: the flags were ignored and the number of
    replacements was capped.

    Returns the cleaned text.
    """
    if regexs == '':
        return html
    for regex in regexs.split(chr(10)):
        regex = regex.strip()
        if regex == '':
            continue
        if regex[:1] == chr(40) and regex[-1:] == chr(41):
            html = re.compile(regex, re.I | re.S).sub('', html)
        else:
            html = html.replace(regex, '')
    return html

def en2path(enStr):
    """Turn text into a path fragment.

    The text is first passed through en2chr(); every run of non-word
    characters then becomes a single '-', and leading and trailing '-' are
    stripped.

    The published copy passed the flags as the fourth positional argument of
    re.sub(), which is ``count``: the flags were ignored and the number of
    replacements was capped.
    """
    non_word = re.compile(r'[\W]+', re.I | re.U)
    return non_word.sub('-', en2chr(enStr)).strip('-')

def en2chr(enStr):
    """Replace the HTML entity ``&amp;`` with a plain ``&``.

    The published copy replaced '&' with '&', which changes nothing; the
    entity was restored.
    """
    return enStr.replace('&amp;', '&')

# ------------------------------------- 开始执行程序 -------------------------------------------

# Make sure the tables exist before anything is collected.
createDatabase()

# Crawl loop. ``loops`` holds the result of the last collect() call: the first
# pass fetches the default start page, later passes take the next URL from
# the queue. The loop ends once the queue has nothing left to fetch.
loops = 0
while True:
    if loops > 0:
        url = getOneUrl()
        if url == "":
            loops = 0
        else:
            loops = collect(url)
    else:
        loops = collect()
    # Throttle the crawl.
    time.sleep(sleep)
    if loops == 0:
        break

# Release the HTTP and database connections.
dl.close()
conn.close()
# Leave the program.
sys.exit()

页: [1]

运维网's Archiver

Python写的简易采集爬虫(蜘蛛)