Posted by ABKYH on 2018-08-10 08:53:21

Python crawler: scraping WeChat articles (via Sogou WeChat search)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl WeChat articles via Sogou's WeChat search (weixin.sogou.com)."""

import re
import time
import urllib.error
import urllib.request

# Pretend to be a desktop Chrome browser so the site serves normal pages
# instead of blocking the default urllib user agent.
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
opener = urllib.request.build_opener()
# BUG FIX: the original line read `opener.addheaders=` with no right-hand
# side (a SyntaxError); addheaders expects a list of (name, value) pairs.
opener.addheaders = [headers]
# Make the header-carrying opener the global default for urlopen().
urllib.request.install_opener(opener)

# Accumulates one list of article URLs per fetched result page.
listurl = []
  

  
## Proxy-server fetch helper (commented out in the original post; kept for
## reference — uncomment and call with a "host:port" proxy to route requests).
#def use_proxy(proxy_addr, url):
#    try:
#        import urllib.request
#        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
#        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
#        urllib.request.install_opener(opener)
#        data = urllib.request.urlopen(url).read().decode('utf-8')
#        data = str(data)
#        return data
#    except urllib.error.URLError as e:
#        if hasattr(e, "code"):
#            print(e.code)
#        if hasattr(e, "reason"):
#            print(e.reason)
#        time.sleep(10)
#    except Exception as e:
#        print("exception" + str(e))
#        time.sleep(1)
  
## Collect the article links from every search-result page.
def getlisturl(key, pagestart, pageend):
    """Collect WeChat article links from Sogou search result pages.

    Args:
        key: search keyword (URL-quoted before use).
        pagestart: first result-page number, inclusive.
        pageend: last result-page number, inclusive.

    Returns:
        The module-level ``listurl`` list — one list of article URLs
        appended per result page — or ``None`` if an error aborted the run.
    """
    try:
        keycode = urllib.request.quote(key)
        for page in range(pagestart, pageend + 1):
            url = ("http://weixin.sogou.com/weixin?type=2&query="
                   + keycode + "&page=" + str(page))
            data1 = urllib.request.urlopen(url).read().decode('utf-8')
            data1 = str(data1)
            # Article links appear in the results as <a data-z="art" ... href="http://...">
            listurlpat = '<a data-z="art".*?(http://.*?)"'
            listurl.append(re.compile(listurlpat, re.S).findall(data1))
            time.sleep(2)  # be polite: pause between page fetches
        print("共获取到" + str(len(listurl)) + "页")
        # BUG FIX: the original printed len(listurl) (the PAGE count) as the
        # link count; report the number of links found on page 2 instead,
        # guarding against runs that fetched fewer than two pages.
        if len(listurl) > 1:
            print("第2页链接数" + str(len(listurl[1])) + "个")
        return listurl
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)  # back off after an HTTP/URL error
    except Exception as e:
        print("exception" + str(e))
        time.sleep(1)
  

  
## Fetch each article and dump title/content into a local HTML file.
def getcontent(listurl, outpath="/home/urllib/test/1.html"):
    """Download every article in *listurl* and write titles/bodies to HTML.

    Args:
        listurl: list of pages, each itself a list of article URLs
                 (as returned by ``getlisturl``).
        outpath: destination HTML file (default keeps the original
                 hard-coded path for backward compatibility).

    Side effects:
        Overwrites *outpath* with an HTML document containing one
        title/content pair per successfully fetched article.
    """
    # Opening HTML boilerplate for the local output file.
    html1 = '''<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; " />
<title>微信文章页面</title>
</head>
<body>
'''
    with open(outpath, "wb") as fh:
        fh.write(html1.encode("utf-8"))
    # Reopen in append mode to add each article's content.
    with open(outpath, "ab") as fh:
        for i in range(0, len(listurl)):
            # BUG FIX: the inner loop must iterate over page i's links
            # (original used len(listurl), the page count).
            for j in range(0, len(listurl[i])):
                try:
                    # BUG FIX: take the j-th link of the i-th page; the
                    # original assigned the whole list to `url`.
                    url = listurl[i][j]
                    # Sogou HTML-escapes '&' as '&amp;' inside hrefs; undo it.
                    url = url.replace("amp;", "")
                    data = urllib.request.urlopen(url).read().decode('utf-8')
                    data = str(data)
                    # Title lives in an inline JS assignment; body sits
                    # between the js_content and js_sg_bar element ids.
                    titlepat = 'var msg_title = "(.*?)";'
                    contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                    title = re.compile(titlepat).findall(data)
                    content = re.compile(contentpat, re.S).findall(data)
                    # Fallback text used when a pattern matches nothing.
                    thistitle = "此次没有获取到"
                    thiscontent = "此次没有获取到"
                    # BUG FIX: take the first match (a str); the original
                    # assigned the whole list, which breaks the string
                    # concatenation below with a TypeError.
                    if title != []:
                        thistitle = title[0]
                    if content != []:
                        thiscontent = content[0]
                    dataall = ("<p>标题为:" + thistitle + "</p><p>内容为:"
                               + thiscontent + "</p><br>")
                    fh.write(dataall.encode('utf-8'))
                    print("第" + str(i) + "个网页第" + str(j) + "次处理")
                    time.sleep(1)  # be polite: pause between article fetches
                except urllib.error.URLError as e:
                    if hasattr(e, "code"):
                        print(e.code)
                    if hasattr(e, "reason"):
                        print(e.reason)
                    time.sleep(10)  # back off after an HTTP/URL error
                except Exception as e:
                    print("exception" + str(e))
                    time.sleep(1)
    # Closing HTML boilerplate.
    html2 = '''</body>
</html>
'''
    with open(outpath, "ab") as fh:
        fh.write(html2.encode("utf-8"))
  
# Driver: crawl pages 1-3 of Sogou WeChat results for "科技" (technology)
# and dump every article found into the local HTML file.
key = "科技"
# proxy = "122.114.31.177:808"  # optional proxy for use_proxy(); unused
pagestart = 1
pageend = 3
listurl = getlisturl(key, pagestart, pageend)
getcontent(listurl)
Page: [1]
View full version: Python crawler: scraping WeChat articles