#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib.request
def gettext(url, page):
    """Fetch one listing page and print each post's author and text.

    The page at ``url`` is downloaded with a browser-like User-Agent header,
    decoded as UTF-8, and scanned with two regular expressions: every
    ``<h2>...</h2>`` is taken as an author name and every
    ``<div class="content">...</div>`` as a post body.  The two result lists
    are paired by position; if their lengths differ, the extra entries of the
    longer list are ignored.

    Args:
        url: Address of the page to download.
        page: Page number, used only in the printed heading of each post.

    Returns:
        None.  Results are written to standard output.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Installed process-wide so that urlopen() below (and any later urlopen
    # call) sends the User-Agent header configured above.
    urllib.request.install_opener(opener)
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode("utf-8")

    # re.S lets "." match newlines, since the captured markup spans lines.
    userlist = re.findall(r'<h2>(.*?)</h2>', data, re.S)
    textlist = re.findall(r'<div class="content">(.*?)</div>', data, re.S)

    # Pair authors and texts by position.  Going through a dict keyed by the
    # author name would silently drop posts whenever two authors share a name.
    for x, (key, value) in enumerate(zip(userlist, textlist), start=1):
        # Drop raw newlines and <span> wrappers, then turn <br/> tags into
        # real line breaks.
        value = value.replace("\n", "")
        value = value.replace("<span>", "")
        value = value.replace("</span>", "")
        value = value.replace("<br/>", "\n")
        print("第" + str(page) + "页" + str(x) + "用户" + key)
        print("内容:" + value)
        print('\n')
        print("-----------------------------")
# Scrape listing pages 1 and 2 (the upper bound of range() is exclusive) and
# print the posts found on each.
for i in range(1, 3):
    url = "https://www.qiushibaike.com/8hr/page/" + str(i)
    gettext(url, i)