程序 python 抓取新浪读书频道小说
版权声明:请尊重原创作品。转载请保持文章完整性,并以超链接形式注明原始作者“tingsking18”和主站点地址,方便其他朋友提问和指正。
二进制文件下载地址:
SinaGetBook
效果如图:
http://beta.hi.csdn.net/attachment/200909/7/1207120_1252366646Kl60.jpg
代码:
#!/usr/bin/env python
# coding=utf-8
"""wxPython tool that scrapes a novel from the Sina book channel.

The window asks for a table-of-contents URL, a URL prefix for the chapter
pages and a list of boilerplate lines to strip.  Pressing the button
downloads every chapter linked from the table of contents and appends its
title and cleaned text to the rich-text control.

This script targets Python 2 (it relies on ``urllib.urlopen``).
"""
import traceback
import sys
import re
import urllib
from contextlib import closing

import wx
import wx.richtext as rt
import wx.lib.buttonpanel as bp

import Casing
import Debug

# Encoding used to decode the downloaded pages.
_PAGE_ENCODING = 'gb2312'

# Relative chapter links in the (lower-cased) table-of-contents page.
_CHAPTER_LINK_RE = re.compile(r'<a href="(chapter_+.*?)" target="_blank">')
# Chapter heading and body container on a chapter page.
_TITLE_RE = re.compile(r'<h1>(.*?)</h1>')
_CONTENT_RE = re.compile(
    r'<div id="contTxt" class="contTxt1"><p>([\s\S]*?)</p></div>')

# Markup fragments that are always removed from the chapter body.
_MARKUP_NOISE = (u"<p>", u"</p>", u"*")


def trace_back():
    """Print the traceback of the exception currently being handled.

    Returns:
        The result of ``traceback.print_exc()`` (``None``), or ``''`` if
        printing the traceback itself fails.
    """
    try:
        return traceback.print_exc()
    except Exception:
        return ''


def _fetch_page(url):
    """Download *url* and return its body decoded to unicode text.

    Undecodable bytes are dropped.  The connection is always closed.
    """
    with closing(urllib.urlopen(url)) as sock:
        raw = sock.read()
    return raw.decode(_PAGE_ENCODING, 'ignore')


def _find_chapter_links(index_html):
    """Return the relative chapter links found in the table-of-contents page.

    The page is lower-cased before matching, so the returned links are
    lower-case as well.
    """
    return _CHAPTER_LINK_RE.findall(index_html.lower())


def _extract_chapter(chapter_html, unwanted_lines):
    """Pull the title and cleaned body text out of one chapter page.

    Args:
        chapter_html: Decoded HTML of the chapter page.
        unwanted_lines: Strings that are removed from the body wherever
            they occur; empty entries are ignored.

    Returns:
        A ``(title, text)`` tuple, or ``None`` when the page has no body
        container.  ``title`` is empty when the page has no ``<h1>``.
    """
    body_match = _CONTENT_RE.search(chapter_html)
    if body_match is None:
        return None

    title_match = _TITLE_RE.search(chapter_html)
    title = title_match.group(1) if title_match else u""

    text = body_match.group(1)
    for fragment in _MARKUP_NOISE:
        text = text.replace(fragment, u"")
    for line in unwanted_lines:
        if line:
            text = text.replace(line, u"")
    return title, text


class Window(wx.Frame):
    """Main window: input fields, a start button and the output area."""

    def __init__(self):
        """Build the widgets, lay them out in a 3-column grid and show the frame."""
        # site.py removes sys.setdefaultencoding at start-up, so the call is
        # only possible when something has re-loaded sys beforehand.
        if hasattr(sys, "setdefaultencoding"):
            sys.setdefaultencoding("utf-8")

        wx.Frame.__init__(self, None, -1, u'新浪网图书频道抓取工具',
                          pos=wx.Point(0, 0), size=(800, 620))

        l1 = wx.StaticText(self, -1, u"目录URL:")
        self.t1 = wx.TextCtrl(
            self, -1, "http://vip.book.sina.com.cn/book/?book=27633",
            size=(500, -1))

        l2 = wx.StaticText(self, -1, u"内容URL前缀:")
        self.t2 = wx.TextCtrl(
            self, -1, "http://vip.book.sina.com.cn/book/", size=(500, -1))

        l3 = wx.StaticText(self, -1, u"替换的内容:")
        self.t3 = wx.TextCtrl(
            self, -1,
            u"阅读‘刘猛’的其他作品: \n"
            u"http://vip.book.sina.com.cn/book/?book=39011《狼牙》作者新作:冰是睡着的水\n"
            u"http://vip.book.sina.com.cn/book/?book=41217刘猛展示狙击手神秘生活:刺客\n"
            u"http://vip.book.sina.com.cn/book/?book=38884中国特种部队生存实录:狼牙\n"
            u"http://vip.book.sina.com.cn/book/?book=43226刘猛最新力作:如临大敌",
            size=(500, 100), style=wx.TE_MULTILINE | wx.TE_PROCESS_ENTER)
        self.t3.SetInsertionPoint(0)

        l4 = wx.StaticText(self, -1, u"内容")
        self.t4 = rt.RichTextCtrl(
            self, -1, "", size=(600, 400),
            style=wx.VSCROLL | wx.HSCROLL | wx.NO_BORDER)

        self.b = wx.Button(self, -1, u"开始抓取")
        self.Bind(wx.EVT_BUTTON, self.OnTestReplace, self.b)

        space = 2
        bsizer = wx.BoxSizer(wx.VERTICAL)
        bsizer.Add(self.b, 0, wx.GROW | wx.ALL, space)

        # Three columns: label, input control, optional button column.
        sizer = wx.FlexGridSizer(cols=3, hgap=space, vgap=space)
        sizer.AddMany([
            l1, self.t1, (0, 0),
            l2, self.t2, (0, 0),
            l3, self.t3, bsizer,
            l4, self.t4, (0, 0),
        ])

        border = wx.BoxSizer(wx.VERTICAL)
        border.Add(sizer, 0, wx.ALL, 15)
        self.SetSizer(border)
        self.SetAutoLayout(True)
        self.Show(True)

    def OnTestReplace(self, evt):
        """Button handler: download every chapter and append it to the output.

        The field values are read up front; the download itself is handed
        to ``Casing.Casing(...).start_thread()``.

        Args:
            evt: The button event (unused).
        """
        index_url = self.t1.GetValue()
        prefix = self.t2.GetValue()
        unwanted_lines = self.t3.GetValue().split("\n")

        def worker():
            try:
                links = _find_chapter_links(_fetch_page(index_url))
            except Exception:
                Debug.error.traceback()
                return

            for link in links:
                # A failure on one chapter must not abort the remaining ones.
                try:
                    chapter = _extract_chapter(
                        _fetch_page(prefix + link), unwanted_lines)
                except Exception:
                    Debug.error.traceback()
                    continue
                if chapter is None:
                    continue
                title, text = chapter
                # NOTE(review): start_thread presumably runs this function off
                # the GUI thread, so the widget update is queued onto the main
                # loop instead of being called directly - confirm against Casing.
                wx.CallAfter(self.t4.AppendText, title + u"\n" + text)

        Casing.Casing(worker).start_thread()


application = wx.PySimpleApp()
Window()
application.MainLoop()
页:
[1]