(1)提取文章的标题
为了方便操作,我们用BeautifulSoup来分析网页,对html文本我们提取title之间的内容为
<title>东望洋_韩寒_新浪博客</title>
将其强制转换为字符串,然后进行切片操作,大致取 string[7 : -28],即可得到文章的标题。
代码如下(Python):
from bs4 import BeautifulSoup
import re
# Print the title of every saved article page, HanhanArticle/1.html
# through HanhanArticle/316.html.
for i in xrange(1, 317):
    filename = 'HanhanArticle/' + str(i) + '.html'
    # The with-block closes the file even if parsing raises.
    with open(filename, 'r') as html:
        soup = BeautifulSoup(html)
    title = soup.find('title')
    string = str(title)
    # Drop the leading "<title>" (7 characters) and a fixed 28-character
    # tail (site suffix plus closing tag; the accompanying notes call this
    # offset approximate).
    article = string[7:-28].decode('utf-8')
    # Skip empty titles (indexing them would raise IndexError) and titles
    # that start with a dot.
    if article and article[0] != '.':
        print(article)
但是有些标题中的内容还需要处理,比如<<ONE IS ALL>>,本来应该解释为《ONE IS ALL》;
还有比如“中央电视台很*很**”,这里的 * 不能作为文件名中的字符。
代码如下(Python):
#coding:utf-8
import re
import urllib2
from bs4 import BeautifulSoup
def getPageURLs(url):
    """Return the article URLs linked from one article-list page.

    url: address of the list page to download.
    Returns a list with the captured href value of every matching anchor.
    Errors raised while opening or reading the page propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        text = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    # Every dot of the host name is escaped so it matches only a literal
    # '.', and [^"]+ keeps each attribute match inside its own quotes, so
    # several anchors on one line are captured separately.
    pattern = (r'<a title="[^"]+" target="_blank" '
               r'href="(http://blog\.sina\.com\.cn[^"]+\.html)">')
    return re.findall(pattern, text)
def getStore(title, url):
    """Download url and save the response body as HanhanArticle/<title>.html.

    title: used verbatim as the file-name stem; no sanitising is done here.
    url: address of the page to download.
    Returns None. Network and file-system errors propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        context = response.read()
    finally:
        # Close the response even if the read fails.
        response.close()
    filename = 'HanhanArticle/' + title + '.html'
    # The with-block closes the file even if the write fails.
    with open(filename, 'w') as f:
        f.write(context)
def getTitle(url):
    """Download url and return the article title cut from its <title> tag.

    url: address of the article page.
    Returns the string form of the <title> element with its first 7 and
    last 28 characters removed. When the page has no <title>, find()
    yields None and the slice of 'None' is the empty string.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Close the response even if the read fails.
        response.close()
    soup = BeautifulSoup(html)
    title = soup.find('title')
    string = str(title)
    # 7 == len('<title>'); the 28-character tail is a fixed offset for the
    # site suffix and closing tag.
    return string[7:-28]
def Judge(title):
    """Return True when title contains no '*' character, otherwise False.

    The original loop compared the whole title to '*' on every iteration,
    so it only rejected a title that was exactly '*'. The check is now
    applied to every character.
    """
    return '*' not in title
def getAllURLs():
    """Download every article linked from article-list pages 1 through 7.

    For each article URL found by getPageURLs, the title is fetched with
    getTitle, decoded from UTF-8 and printed. The article is stored with
    getStore only when the title is non-empty, does not start with '.',
    and Judge(title) approves it. Returns None.
    """
    for page in xrange(1, 8):
        list_url = ('http://blog.sina.com.cn/s/articlelist_1191258123_0_'
                    + str(page) + '.html')
        for article_url in getPageURLs(list_url):
            title = getTitle(article_url).decode('utf-8')
            print(title)
            # The emptiness check comes first so title[0] cannot raise
            # IndexError when no title could be extracted.
            if title and title[0] != '.' and Judge(title):
                getStore(title, article_url)
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    getAllURLs()
提取正文部分后续再处理。。。