Python Web Crawler
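The crawler below is written in Python 2 and consists of two files: splider.py, the crawler itself, which repeatedly downloads the highest-priority url from a priority queue, and spdUtility.py, which provides the PriorityQueue and the HTML link Parser that the crawler uses.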
1:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#filename:splider.py
#author:wfu(fuweilin@hotmail.com)
from spdUtility import PriorityQueue,Parser
import urllib2
import sys
import os
def updatePriQueue( priQueue, url ):
    "Update the priority queue with this url."
    extraPrior = url.endswith('.html') and 2 or 0     # prefer urls ending in .html
    extraMyBlog = 'www.kgblog.net' in url and 5 or 0  # give pages from the specified site extra priority -- pay-for-ranking crawling??
    item = priQueue.getitem(url)
    if item :
        # the url is already queued: raise its priority and re-insert it
        newitem = ( item[0]+1+extraPrior+extraMyBlog, item[1] )
        priQueue.remove(item)
        priQueue.push( newitem )
    else :
        priQueue.push( (1+extraPrior+extraMyBlog,url) )
def getmainurl(url):
    "Get the site root of the url, used as a prefix for relative urls."
    ix = url.find('/',len('http://') )
    if ix > 0 :
        return url[:ix]
    else :
        return url
def analyseHtml(url, html, priQueue, downlist):
    "Parse the hyperlinks in the html and update the priority queue."
    p = Parser()
    try :
        p.feed(html)
        p.close()
    except:
        return
    mainurl = getmainurl(url)
    for k, v in p.anchors.items():
        for u in v :
            if not u.startswith('http://'): # handle relative urls
                u = mainurl + u
            if not downlist.count(u) :      # skip urls that have already been downloaded
                updatePriQueue( priQueue, u )
def downloadUrl(id, url, priQueue, downlist, downFolder):
    "Download the given url and parse the hyperlinks in its html."
    downFileName = downFolder+'/%d.html' % (id,)
    print 'downloading',url,'as', downFileName ,
    try:
        fp = urllib2.urlopen(url)
    except:
        print '[ failed ]'
        return False
    else :
        print '[ success ]'
    downlist.push( url )   # add the downloaded url to the list
    op = open(downFileName,"wb")
    html = fp.read()
    html = unicode(html,"gb18030","ignore").encode("utf8")   # convert GB18030 pages to UTF-8 before saving
    op.write( html )
    op.close()
    fp.close()
    analyseHtml(url,html,priQueue,downlist)
    return True
def spider(beginurl, pages, downFolder):
    "Crawler main loop: repeatedly take the highest-priority url from the priority queue and process it."
    priQueue = PriorityQueue()
    downlist = PriorityQueue()   # set of already downloaded urls, to avoid fetching them twice
    priQueue.push( (1,beginurl) )
    i = 0
    while not priQueue.empty() and i < pages :
        k, url = priQueue.pop()
        if downloadUrl(i+1, url, priQueue, downlist, downFolder):
            i += 1
    print '\nDownloaded',i,'pages in total.'
def main():
    "Main function: set the start url, the number of pages to crawl and the download folder."
    beginurl = 'http://www.csdn.net'   # url to start crawling from
    pages = 10                         # number of pages to crawl
    downloadFolder = './down'          # folder where the downloaded pages are saved
    if not os.path.isdir( downloadFolder ):
        os.mkdir( downloadFolder )
    spider( beginurl, pages, downloadFolder )

if __name__ == '__main__':
    main()
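Not part of the original post: a minimal sketch of driving the crawler from another script, assuming splider.py and spdUtility.py are on the Python path; the start url, page count and folder below are placeholders.

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Example (not from the original post): run the crawler with custom parameters.
import os
from splider import spider

start = 'http://www.example.com'   # placeholder start page
folder = './example_down'          # placeholder download folder
if not os.path.isdir(folder):
    os.mkdir(folder)
spider(start, 5, folder)           # fetch at most 5 pages into the folder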
2:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#filename:spdUtility.py
#author:wfu(fuweilin@hotmail.com)
import bisect
import string
import htmllib
import formatter
class PriorityQueue(list):
    "Priority queue storing urls together with their priorities as (priority, url) tuples."
    def __init__(self):
        list.__init__(self)
        self.map = {}
    def push(self, item):
        # insert in sorted order, avoiding duplicates; bisect.insort_left can also be used for ascending order
        if self.count(item) == 0:
            bisect.insort(self, item)
            self.map[item[1]] = item
    def pop(self):
        # list.pop() removes the last element, i.e. the entry with the highest priority
        r = list.pop(self)
        del self.map[r[1]]
        return r
    def getitem(self, url):
        if self.map.has_key(url):
            return self.map[url]
        else:
            return None
    def empty(self):
        return len(self) == 0
    def remove(self, item):
        list.remove(self, item)
        del self.map[item[1]]
    def count(self, item):
        if len(self) == 0:
            return 0
        # binary search over the sorted list
        left = 0
        right = len(self) - 1
        mid = -1
        while left <= right:
            mid = (left + right) / 2
            if self[mid] < item:
                left = mid + 1
            elif self[mid] > item:
                right = mid - 1
            else:
                break
        return self[mid] == item and 1 or 0
class Parser(htmllib.HTMLParser):
    # HTML parsing class: collects anchor hrefs keyed by their link text
    def __init__(self, verbose=0):
        self.anchors = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)
    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href
    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
def main():   # just for test
    pq = PriorityQueue()
    # add items out of order
    pq.push((1, 'http://www.baidu.com'))
    pq.push((2, 'http://www.sina.com'))
    pq.push((3, 'http://www.google.com'))
    pq.push((1, 'http://www.163.com'))
    item = pq.getitem('http://www.sina.com')
    print item
    print pq.count(item)
    pq.remove(item)
    print pq.count(item)
    # print queue contents
    while not pq.empty():
        print pq.pop()

if __name__ == '__main__':
    main()
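A note that is not part of the original post: urllib2 and htmllib only exist in Python 2. Under Python 3, a roughly equivalent fetch-and-extract-links step could look like the sketch below, using urllib.request and html.parser; this is only one possible way to port the code, not the author's implementation.

#! /usr/bin/env python3
# Sketch (not from the original post): fetch a page and collect its hrefs in Python 3.
import urllib.request
from html.parser import HTMLParser

class LinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []
    def handle_starttag(self, tag, attrs):
        # collect the href attribute of every <a> tag
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def fetch_links(url):
    html = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    p = LinkParser()
    p.feed(html)
    return p.links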