Python网页抓取程序(续)

hb120973135 发表于 2015-4-27 11:45:33

　　继续上次的话题，这次抓取的是天涯论坛中的“地缘看世界”帖子。
　　1、获取网址：通过正则表达式来获取各贴子网址
# Step 1: collect the IDs of every page of the thread from the first page.
link = 'http://www.tianya.cn/publicforum/content/worldlook/1/223829.shtml'
html = urllib2.urlopen(link).read()
# The IDs are taken from the text matched around name='idArticleslist' value=...>
m = re.search(r'name=\'idArticleslist\' value=\S*>', html)
# NOTE(review): the published pattern was r'+', which is not a valid regular
# expression (nothing to repeat); the leading token was evidently lost when
# the post was rendered. r'\d+' is assumed because the page IDs used in the
# URLs are numeric (e.g. 223829) -- confirm against the real page.
# An empty list is used when the marker is absent instead of failing on
# None.group().
IDs = re.findall(r'\d+', m.group(0)) if m else []
for ID in IDs:
    url = "http://www.tianya.cn/publicforum/content/worldlook/1/%s.shtml" % ID
　　2、下载网页：以前是边下载边处理，这样处理时间长，有时还有下载不了的情况，现改为先下载到指定目录，并在下载前检查是否已存在同名文件。
# Step 2: download one page into the local cache directory.
# Raw string: the value is literally  .\html\\  (two trailing backslashes).
htmldir = r'.\html\\'
# The local file is named after the last path component of the URL.
filename = htmldir + url.split('/')[-1]
# Download only when there is no usable local copy (missing or zero-length).
if (not os.path.exists(filename)) or os.path.getsize(filename) == 0:
    print('downloading' + filename + '\n')
    html = urlRead(url)
    # Skip empty results so a failed download is retried on the next run
    # instead of leaving an empty file behind.
    if html:
        # 'with' guarantees the handle is closed even if write() raises.
        with open(filename, 'w') as f:
            f.write(html)
　　3、下载后对网页内容进行分析，在分析前要对网页进行处理以去除htmlparser无法处理的部分，实质是对网页进行截取，并将无法处理字符串替换
# Step 3: cut the page down to the part to be parsed and patch markup that
# HTMLParser cannot handle.
# NOTE(review): the split pattern is empty as published -- the original
# delimiter appears to have been lost when the post was rendered; restore it
# before using this snippet.
txts=re.split(r'',html)
# NOTE(review): re.split returns a list, so txt is a list here and the
# re.sub call below would raise TypeError; an index into txts was presumably
# lost as well -- TODO confirm which piece is wanted.
txt=txts
# Wrap every occurrence of the byte sequence \xcb\xce\xcc\xe5 in single
# quotes (presumably a GBK-encoded font name appearing as an unquoted
# attribute value -- verify against the page source).
txt=re.sub('\xcb\xce\xcc\xe5','\'\xcb\xce\xcc\xe5\'', txt)
　　4、提取贴子的正文，还是正规的htmlparser的方法，但这种方法速度很慢，也可采用正则表达式的方法，但这样适应性不强。
　　这个贴子中有大量的图片，在文本中也以图片标签（如 <img src="图片文件名"/>）的形式保存
class DocParser(HTMLParser.HTMLParser):
    """Extract post text and image references from a downloaded thread page.

    Reading is armed by a <span value="10174465"> (presumably the id of the
    author whose posts are wanted -- TODO confirm), starts at the next
    <div class="post">, and stops at the first closing </div> seen after
    that. Collected text accumulates in ``self.doc``; images found while
    reading are recorded in ``self.doc`` and queued for download on ``pool``.

    Relies on the module-level names ``htmldir`` and ``getImg``.
    """

    # NOTE(review): the published code appended '\n' % imgname, which raises
    # TypeError (no conversion specifier); the markup around the file name was
    # evidently stripped when the post was rendered. An <img> tag is assumed
    # here -- adjust it to whatever the consumer of ``doc`` expects.
    IMG_MARKUP = '<img src="%s"/>\n'

    def __init__(self, pool):
        """Create a parser that submits image downloads to *pool*.

        pool -- object exposing add_task(func, *args), e.g. a ThreadPool.
        """
        HTMLParser.HTMLParser.__init__(self)
        self.pool = pool
        self.startread = 0   # 1 while inside the post body being collected
        self.pre = 0         # 1 once the marker <span> has been seen
        self.doc = ''        # accumulated output text

    def handle_starttag(self, tag, attrs):
        """Track the marker span / post div and record images in the post."""
        if tag == 'span':
            if ('value', '10174465') in attrs:
                self.pre = 1
        elif tag == 'div':
            if self.pre == 1 and ('class', 'post') in attrs:
                self.startread = 1
        elif tag == 'img' and self.startread == 1:
            for (name, value) in attrs:
                # An attribute without a value is reported as None.
                if name != 'original' or not value:
                    continue
                # Local file name: the last four path components glued
                # together (fewer if the URL is shorter, instead of raising
                # IndexError).
                imgname = ''.join(value.split('/')[-4:])
                self.doc += self.IMG_MARKUP % imgname
                # Only queue a download when the image is not cached yet.
                if not os.path.exists(htmldir + imgname):
                    self.pool.add_task(getImg, value, htmldir)

    def handle_endtag(self, tag):
        """Stop reading at the first closing </div> and disarm the marker."""
        if tag == 'div' and self.startread == 1:
            self.doc += '\n\n\n'
            self.pre = 0
            self.startread = 0

    def handle_data(self, data):
        """Append text data, one chunk per line, while inside the post."""
        if self.startread:
            self.doc += data
            self.doc += '\n'
　　5、破解防外链接：通过设置Referer实现
# Step 5: get past hotlink protection by sending the site's own address as
# the Referer of the request.
preurl = 'http://www.tianya.cn/'
req = urllib2.Request(url, headers={'Referer': preurl})
　　6、提高urlopen工作健壮性，设置重试次数和超时等待
　　改造后的urlopen如下：
def urlRead(url, max_fails=100, timeout=30, retry_wait=10,
            preurl='http://www.tianya.cn/'):
    """Fetch *url* with retries and return the response body.

    The request carries a Referer header (*preurl*) to get past hotlink
    protection. An attempt counts as failed when it raises, or when the
    server announced a Content-Length that the body read does not match
    (truncated transfer). Responses without Content-Length are accepted
    as read.

    url        -- address to fetch.
    max_fails  -- number of failed attempts after which to give up.
    timeout    -- per-attempt timeout in seconds passed to urlopen.
    retry_wait -- seconds to sleep between failed attempts.
    preurl     -- value sent as the Referer header.

    Returns the body on success, or '' after *max_fails* failed attempts
    (a message is printed in that case).
    """
    fails = 0
    while fails < max_fails:
        try:
            req = urllib2.Request(url)
            req.add_header('Referer', preurl)
            response = urllib2.urlopen(req, timeout=timeout)
            try:
                # .get() rather than [...]: the header is optional, and its
                # absence must not be mistaken for a failed download.
                expected = response.info().get('Content-Length')
                rs = response.read()
            finally:
                response.close()
            # The header value is a string; compare as a number.
            if expected is None or len(rs) == int(expected):
                return rs
        except Exception:
            # Deliberately broad: any network/HTTP/parsing error simply
            # counts as one failed attempt and is retried.
            pass
        fails += 1
        if fails < max_fails:
            time.sleep(retry_wait)
    print('Failed to Read ' + url)
    return ''
　　7、简易多线程下载：以前试用过stackless，感觉没有效果，后来还是使用threadpool类：
　　from Queue import Queue
from threading import Thread
class Worker(Thread):
    """Daemon thread that runs tasks taken from a shared queue forever.

    Each queue item is a ``(func, args, kargs)`` tuple. The thread starts
    itself on construction.
    """

    def __init__(self, tasks):
        """Bind the worker to the *tasks* queue and start it immediately."""
        Thread.__init__(self)
        self.tasks = tasks
        # Daemon: the worker never leaves its loop, so it must not keep the
        # interpreter alive on exit.
        self.daemon = True
        self.start()

    def run(self):
        """Execute queued tasks one at a time; a failing task is reported
        with print and does not stop the worker."""
        while True:
            func, args, kargs = self.tasks.get()
            try:
                func(*args, **kargs)
            except Exception as e:
                print(e)
            finally:
                # Always acknowledge the item, even when the task raised
                # something that is not an Exception subclass; otherwise
                # Queue.join() would wait forever.
                self.tasks.task_done()
class ThreadPool:
    """A fixed number of Worker threads fed from one shared task queue."""

    def __init__(self, num_threads):
        """Create the queue and start *num_threads* workers on it."""
        # The queue's maxsize equals the number of workers, so add_task
        # blocks once that many tasks are waiting.
        self.tasks = Queue(num_threads)
        for _worker_no in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Queue the call ``func(*args, **kargs)`` for a worker to run."""
        task = (func, args, kargs)
        self.tasks.put(task)

    def wait_completion(self):
        """Block until every queued task has been processed."""
        self.tasks.join()
　　使用时：
# Download every page of the thread through a pool of 200 worker threads.
url_pattern = "http://www.tianya.cn/publicforum/content/worldlook/1/%s.shtml"
pool = ThreadPool(200)
for ID in IDs:
    url = url_pattern % ID
    pool.add_task(getHtml, url, htmldir)
# Block until all queued downloads have been processed.
pool.wait_completion()
　　8、输出到pdf：我使用了reportlab，要注意的是：
　　引用字体：
# Font setup for Chinese text: register the two TrueType faces, map them,
# and derive a paragraph style that uses the regular face.
reportlab.rl_config.warnOnMissingFontGlyphs = 0

for face_name, ttf_file in (('YaHei', 'msyh.ttf'), ('YaHeiBD', 'msyhbd.ttf')):
    pdfmetrics.registerFont(TTFont(face_name, ttf_file))

# NOTE(review): each name is mapped only onto itself ('YaHei' for 0,0 and
# 0,1; 'YaHeiBD' for 1,0 and 1,1) -- confirm that no 'YaHei' -> 'YaHeiBD'
# mapping for the bold case is wanted.
for mapping in (('YaHei', 0, 0, 'YaHei'),
                ('YaHei', 0, 1, 'YaHei'),
                ('YaHeiBD', 1, 0, 'YaHeiBD'),
                ('YaHeiBD', 1, 1, 'YaHeiBD')):
    fonts.addMapping(*mapping)

stylesheet = getSampleStyleSheet()
# Work on a deep copy so the sample sheet's own 'Normal' style is untouched.
normalStyle = copy.deepcopy(stylesheet['Normal'])
normalStyle.fontName = 'YaHei'
　　其次是中文换行：
# Select the 'CJK' word-wrap mode on the style so Chinese text is wrapped.
normalStyle.wordWrap = 'CJK'
　　中文的左缩进有bug：设置左缩进后，第一行全部向右移，并不是比其他的行前面少几个字。
　　最后是嵌入图片：取得图片的长和宽，然后设置缩放比：
def get_image(path):
    """Build an Image for *path* that fits inside a 439 x 685 box.

    An image already within the box is returned at its own size. A larger
    one is scaled down by whichever ratio (width or height) is bigger, so
    the aspect ratio is kept and the limiting side exactly fills the box.
    """
    max_w = 439
    max_h = 685
    src_w, src_h = utils.ImageReader(path).getSize()

    # Small enough already: no explicit size needed.
    if src_w <= max_w and src_h <= max_h:
        return Image(path)

    ratio_w = float(src_w) / float(max_w)
    ratio_h = float(src_h) / float(max_h)
    if ratio_w > ratio_h:
        # Width is the limiting side.
        return Image(path, max_w, round(src_h / ratio_w))
    # Height is the limiting side (or both ratios are equal).
    return Image(path, round(src_w / ratio_h), max_h)
　　结论：python是处理网页的利器，但我在编码上浪费了许多时间，在htmlparser和re中对中文的编码支持也不好，不知道各达人有没有什么好方法。
　　python号称是“胶水语言”，但我始终对如何将其与其他诸如C＃，java在一起使用感到困惑。
　　多线程下载还是存在很多问题，各位在借鉴时一定要注意。

页: [1]

运维网's Archiver

Python网页抓取程序(续)