设为首页 收藏本站
查看: 999|回复: 0

[经验分享] python抓取所有航空公司新闻

[复制链接]

尚未签到

发表于 2017-5-4 11:27:48 | 显示全部楼层 |阅读模式
#coding=utf-8
import urllib
import time
import re
import threading
import string
import urllib2
import chardet
import MySQLdb
import base64
import os
import sys
from pyquery import PyQuery
# Force the default string encoding to UTF-8 (Python 2 idiom; reload() is
# required because site.py deletes sys.setdefaultencoding at startup).
reload(sys)
sys.setdefaultencoding("utf-8")
# Earliest news date to crawl back to; crawlers stop once articles are older.
earliestTime = "2010-06-01"
# Global lock serializing console output across crawler threads.
mutex = threading.Lock()
def printf(obj):
"""多线程的输出"""
global mutex
mutex.acquire()
print obj
mutex.release()

def mysqlDataBase_init(database="jipiao"):
    """Open a MySQL connection to *database* and return ``(conn, cursor)``.

    The session is forced to UTF-8 for client and result charsets so the
    Chinese news text round-trips correctly.
    """
    conn = MySQLdb.connect(host="localhost", user="root", passwd="",
                           db=database,
                           unix_socket="/opt/lampp/var/mysql/mysql.sock")
    cursor = conn.cursor()
    for statement in ("SET NAMES utf8",
                      "SET CHARACTER_SET_CLIENT=utf8",
                      "SET CHARACTER_SET_RESULTS=utf8"):
        cursor.execute(statement)
    conn.commit()
    return (conn, cursor)
def countNewsUrl(cursor, newsUrl):
    """Return how many rows of ``jipiaonews`` already store *newsUrl*.

    The URL is passed as a query parameter (DB-API placeholder) instead of
    being interpolated into the SQL string, which both fixes the SQL
    injection risk and handles URLs containing quotes.
    """
    selectSql = "select count(*) from jipiaonews where newsUrl=%s"
    cursor.execute(selectSql, (newsUrl,))
    result = cursor.fetchone()
    return result[0]
def checkNewsUrl(cursor, newsUrl):
    """Return 1 if *newsUrl* is already stored, else print it and return 0.

    A URL that is already in the database means the crawler has caught up
    with previously stored news, so callers use a truthy return value as
    their stop condition.  Uses the shared ``printf`` helper instead of
    duplicating its acquire/print/release logic inline.
    """
    if countNewsUrl(cursor, newsUrl) > 0:
        return 1
    # Log the URL about to be crawled (thread-safe).
    printf(newsUrl)
    return 0
def executeSql(cursor, title, postDate, content, airport, source, newsUrl):
    """Insert one news row into ``jipiaonews``.

    Quote characters are stripped from *content* (kept for backward
    compatibility with previously stored rows), and all values are bound
    via DB-API placeholders rather than string interpolation, eliminating
    the SQL injection hole of the original ``%``-formatted INSERT.
    """
    # Strip characters the original crawler considered unfriendly.
    content = content.replace("'", "")
    content = content.replace("\"", "")
    insertsql = ("insert into jipiaonews (title,postDate,content,airport,source,newsUrl)"
                 " values(%s,%s,%s,%s,%s,%s)")
    cursor.execute(insertsql,
                   (title, postDate, content, airport, source, newsUrl))
def parseDate(postDate):
    """Normalize a Chinese date such as '2011年3月5日' to '2011-3-5'.

    The year/month markers become hyphens and the day marker is dropped;
    strings without those markers pass through unchanged.
    """
    for marker, replacement in (("年", "-"), ("月", "-"), ("日", "")):
        postDate = postDate.replace(marker, replacement)
    return postDate
def xiamenhangkong():
    """Crawl Xiamen Airlines (厦门航空) news list pages into jipiaonews.

    Pages http://www.xiamenair.com.cn/newslist/news-<page>.shtml until a
    page fails to load, a page has no articles, or an already-stored URL
    is reached.
    """
    airport = "厦门航空"
    # Date stamp in the list rows, e.g. "2011-3-5".
    matchPostDate=re.compile(r"[\d]+-[\d]+-[\d]")
    conn,cursor=mysqlDataBase_init()
    page=1
    while page!=-1:
        newsPageUrl="http://www.xiamenair.com.cn/newslist/news-%s.shtml"%page
        page+=1
        try:
            pq=PyQuery(newsPageUrl)
        except:
            # Page did not load: assume we ran past the last list page.
            break
        i=0
        while True:
            a= pq("table.artlist a:eq(%s)"%i)
            i+=1
            newsUrl=a.attr("href")
            if not newsUrl:
                # Not even a first article on this page: stop paging entirely.
                if i==1:
                    page=-1
                break
            newsUrl="http://www.xiamenair.com.cn"+newsUrl
            if checkNewsUrl(cursor, newsUrl):
                # Already stored: caught up with previous crawl, stop paging.
                page=-1
                break
            title=a.text()
            tr=pq("table.artlist tr:eq(%s)"%i)
            postDate = matchPostDate.search(tr.text()).group()
            newpq=PyQuery(newsUrl)
            content=newpq("div.artcon").text()
            # Drop everything up to the first ")" — presumably a byline
            # prefix like "(记者...)"; TODO confirm against live pages.
            if content.find(")")!=-1:
                content=content[content.find(")")+1:]
            content=content.strip()
            source=""
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        pass
    conn.commit()
    conn.close()
def chongqinghangkong():
    """Crawl Chongqing Airlines (重庆航空) news from its single index page."""
    airport = "重庆航空"
    newsBaseUrl = "http://www.flycq.com/info/news/index.html"
    pq = PyQuery(newsBaseUrl)
    conn, cursor = mysqlDataBase_init()
    # Six digits embedded in the article filename encode the date (YYMMDD).
    patternDate = re.compile(r"[\d]{6}")
    i = 0
    while i!=-1:
        a = pq("div#newsContent div ul li:eq(%s)" % i).find("a")
        newsUrl = a.attr("href")
        i += 1
        if not newsUrl:
            break
        # Absolute links point off-site; skip them.
        if newsUrl.startswith("http"):
            continue
        # Extract the date from the URL and prefix the century.
        postDate = "20" + patternDate.search(newsUrl).group()
        # Some hrefs carry extra path components; keep only the filename.
        newsUrl = newsUrl[newsUrl.rfind('/') + 1:]
        try:
            newsUrl = "http://www.flycq.com/info/news/" + newsUrl
            if checkNewsUrl(cursor, newsUrl):
                i=-1  # already stored: stop crawling
                break
            newspq = PyQuery(newsUrl)
        except urllib2.HTTPError:
            printf(newsUrl + "   illegal url!")
            continue
        content = newspq("div#newsContent").text()
        # Strip Dreamweaver template markers leaking into the page text.
        if content.find("InstanceBeginEditable") != -1:
            content = content[len("InstanceBeginEditable name=\"content\" "):]
        if content.find("InstanceEndEditable") != -1:
            content = content[:-len("InstanceEndEditable")]
        title = a.text()
        source = ""
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass
def hannanhangkong():
    """Crawl Hainan Airlines (海南航空) paginated news indexes.

    Article URLs are embedded in the anchors' ``onclick`` handlers rather
    than their hrefs.  A fresh DB connection is opened and closed per page.
    """
    airport = "海南航空"
    # Relative article path hidden inside the onclick attribute.
    matchUrl = re.compile(r"/.*\.html")
    matchPostDate = re.compile(r"[\d]{4}-[\d]+-[\d]+")
    page = 0
    while page != -1:
        # First index page has no numeric suffix.
        if page == 0:
            newsPageUrl = "http://www.hnair.com/gyhh/hhxw/index.html"
        else:
            newsPageUrl = "http://www.hnair.com/gyhh/hhxw/index_%s.html" % page
        page += 1
        conn, cursor = mysqlDataBase_init()
        pq = PyQuery(newsPageUrl)
        i = 0
        while True:
            a = pq("div.body a:eq(%s)" % i)
            newsUrl = a.attr("onclick")
            if not newsUrl:
                break
            newsUrl = matchUrl.search(newsUrl).group()
            newsUrl = "http://www.hnair.com/gyhh/hhxw" + newsUrl
            li = pq("div.body li:eq(%s)" % i)
            i += 1
            postDate = matchPostDate.search(li.text()).group()
            if postDate < earliestTime:
                page = -1  # reached articles older than the cutoff
                break
            if checkNewsUrl(cursor, newsUrl):
                page = -1  # already stored: caught up
                break
            # The <li> text is "<title> (<date>)"; chop the date suffix off.
            title = li.text()[:-(len(postDate) + 2)]
            newspq = PyQuery(newsUrl)
            content = newspq("div.body").text()
            # Drop a "本报讯 (...)" byline prefix when present.
            if content.find("本报讯") != -1 and content.find(")") != -1:
                content = content[content.find(")") + len(")"):]
            source = ""
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        conn.commit()
        conn.close()
    pass
def xingfuhangkong():
    """Crawl Joy Air (幸福航空) news by walking article ids downward.

    Reads the newest article id from the list page, then fetches
    ``Detail.aspx?id=<n>`` for decreasing ``n`` until an already-stored
    URL or an article older than ``earliestTime`` is hit.
    """
    airport = "幸福航空"
    newsBasePage = "http://www.joy-air.com/AboutUs/NewsCenter/List.aspx?id=1"
    pq = PyQuery(newsBasePage)
    url = pq("div#newslist").find("a").attr("href")
    # Newest article id, taken from the first link on the list page.
    id = int(re.search(r"[\d]+", url).group())
    newsBaseUrl = "http://www.joy-air.com/AboutUs/NewsCenter/Detail.aspx?id="
    conn, cursor = mysqlDataBase_init()
    while True:
        newsUrl = newsBaseUrl + str(id)
        id -= 1
        if checkNewsUrl(cursor, newsUrl):
            break
        try:
            newspq = PyQuery(newsUrl)
        except urllib2.HTTPError:
            # Gaps in the id sequence are expected; just try the next id.
            continue
        postDate = newspq("div.dynamicmessagesbox span:eq(1)").text()
        # Stop once articles are older than the global cutoff.
        if postDate < earliestTime:
            break
        content = newspq("div.dynamicmessagesbox").text()
        title = newspq("div.dynamicmessagesbox span:eq(0)").text()
        source = ""
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass
def shenzhenhangkong():
    """Crawl Shenzhen Airlines (深圳航空) news from its GBK-encoded site.

    The list page carries no usable date, so rows are stamped with the
    crawl date instead.
    """
    airport = "深圳航空"
    newsPageUrl = "http://www.shenzhenair.com/more.jsp?category=423afc988259499faa94bbaf8b6983b5&title=%E6%B7%B1%E8%88%AA%E6%96%B0%E9%97%BB&lan=1"
    sourceFile = urllib.urlopen(newsPageUrl).read()
    pq = PyQuery(sourceFile.decode("gbk"))
    conn, cursor = mysqlDataBase_init()
    newspqs = pq("table:eq(17)")
    # Anchor text contains a stray trailing </script>; this strips up to it.
    match = re.compile(r"</script>.*")
    i = 0
    while True:
        a = newspqs("a:eq(%s)" % i)
        i += 1
        newsUrl = a.attr("href")
        title = a.text()
        if not newsUrl:
            break
        # Drop the trailing query parameter before making the URL absolute.
        newsUrl = newsUrl[:newsUrl.rfind("&")]
        newsUrl = "http://www.shenzhenair.com/" + newsUrl
        title = match.search(title).group()[len(r"</script>"):].strip()
        if checkNewsUrl(cursor, newsUrl):
            break
        sourceFile = urllib.urlopen(newsUrl).read()
        newspq = PyQuery(sourceFile.decode("gbk"))
        content = newspq("table:eq(2)").text()
        # No date on the page: fall back to the crawl date.
        postDate = time.strftime("%Y%m%d")
        source = ""
        # Keep the text between the second "】" and the last "【"
        # (presumably trimming header/footer brackets — TODO confirm).
        begin = content.find("】")
        begin = content.find("】", begin + 1)
        end = content.rfind("【")
        if begin == -1:
            begin = 0
        content = content[begin + 2:end]
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        # Be polite to the server: pause every 20 articles.
        if i % 20 == 0:
            time.sleep(5)
        pass
    conn.commit()
    conn.close()
    pass
def chunqiuhangkong():
    """Crawl Spring Airlines (春秋航空) from its news page.

    Uses the dedicated news page, which carries even fewer items than the
    homepage.
    """
    airport = "春秋航空"
    newsPageUrl = "http://www.china-sss.com/Static/New_information"
    conn, cursor = mysqlDataBase_init()
    sourceFile = urllib.urlopen(newsPageUrl).read()
    pq = PyQuery(unicode(sourceFile, "utf-8"))
    # Element 0 is skipped — presumably a header row; TODO confirm.
    i = 1
    while True:
        span = pq("div.New_information span:eq(%s)" % i)
        a = pq("div.New_information a:eq(%s)" % i)
        i += 1
        newsUrl = a.attr("href")
        if not newsUrl:
            break
        title = a.text()
        postDate = span.text()
        newsUrl = "http://www.china-sss.com" + newsUrl
        if checkNewsUrl(cursor, newsUrl):
            break
        sourceFile = urllib.urlopen(newsUrl).read()
        newspq = PyQuery(unicode(sourceFile, "utf-8"))
        content = newspq("dl#content_top dd").text()
        source = ""
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass
def dongfanghangkong():
    """Crawl China Eastern Airlines (东方航空) paginated news indexes.

    The 8-digit date is parsed out of each article URL; a connection is
    opened and closed per index page.
    """
    airport = "东方航空"
    newsPageUrl = "http://www.ce-air.com/mu/main/gydh/xwgg/index.html"
    index = 0
    while index != -1:
        # Subsequent index pages carry a numeric suffix.
        if index != 0:
            newsPageUrl = ("http://www.ce-air.com/"
                           "mu/main/gydh/xwgg/index_%s.html" % index)
        index += 1  # next page
        pq = PyQuery(newsPageUrl)
        conn, cursor = mysqlDataBase_init()
        i = 0
        while True:
            a = pq("div.s_xw_list ul li:eq(%s)" % i).find("a")
            i += 1
            href = a.attr("href")
            if not href:
                break
            title = a.text()
            # href begins with "./"; drop it before making the URL absolute.
            newsUrl = "http://www.ce-air.com/mu/main/gydh/xwgg/" + href[2:]
            # Date is the 8-digit run (YYYYMMDD) inside the URL.
            postDate = re.search(r"[\d]{8}", newsUrl).group()
            if postDate < earliestTime.replace("-", ""):
                index = -1  # older than cutoff: stop paging
                break
            if checkNewsUrl(cursor, newsUrl):
                index = -1  # already stored: stop paging
                break
            newspq = PyQuery(newsUrl)
            content = newspq("div.body p:eq(1)").text()
            # NOTE(review): "== None" should idiomatically be "is None".
            if content == None:
                content = newspq("div.body p:eq(0)").text()
            if len(content) == 0:
                # Fall back to the whole body, trimming leaked CSS/JS
                # up to the last "}".
                content = newspq("div.body").text()
                content = content[content.rfind("}") + 1:]
            table = newspq("div.body table").text()
            # Source name appears single-quoted inside the table text.
            source = re.search("'.*'", table).group()[1:-1]
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        conn.commit()
        conn.close()
        pass
    pass
def shanghaihangkong():
    """Crawl Shanghai Airlines (上海航空) bulletins, then the legacy site.

    The airline publishes no proper news, only bulletins; this function
    runs two passes back to back — the current CMS site, then the old
    ``ww1`` site.
    """
    airport = "上海航空"
    newsPageUrl = "http://www.shanghai-air.com/cmsweb/sale.aspx?pageIndex=0"
    sourceFile = urllib.urlopen(newsPageUrl).read()
    pq = PyQuery(unicode(sourceFile, "utf-8"))
    conn, cursor = mysqlDataBase_init()
    i = 0
    while True:
        a = pq("div.items li:eq(%s)" % i).find("a")
        i += 1
        newsUrl = a.attr("href")
        title = a.attr("title")
        if not newsUrl:
            break
        newsUrl = "http://www.shanghai-air.com/cmsweb/" + newsUrl
        if checkNewsUrl(cursor, newsUrl):
            break
        sourceFile = urllib.urlopen(newsUrl).read()
        newspq = PyQuery(unicode(sourceFile, "utf-8"))
        content = newspq("div.nowtext").text()
        if not content:
            content = ""
        # No date on the page: stamp with the crawl date.
        postDate = time.strftime("%Y%m%d")
        source=""
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass
    # --- Second pass: news from the legacy Shanghai Airlines site. ---
    airport = "上海航空"
    newsPageUrl = "http://ww1.shanghai-air.com/news/saldt.asp?ttt="
    matchPostDate = re.compile(r"[\d]+-[\d]+-[\d]+")
    conn, cursor = mysqlDataBase_init()
    pq = PyQuery(newsPageUrl)
    i = 0
    while True:
        a = pq("a:eq(%s)" % i)
        i += 1
        newsUrl = a.attr("href")
        if not newsUrl:
            break
        # Only links into PublicInfo are articles; skip everything else.
        if not newsUrl.find("PublicInfo") != -1:
            continue
        newsUrl = "http://ww1.shanghai-air.com" + newsUrl
        if checkNewsUrl(cursor, newsUrl):
            break
        # Anchor text is "<marker><title> (<date>)"; strip marker and date.
        title = a.text()[1:]
        postDate = parseDate(matchPostDate.search(a.text()).group())
        title = title[:-(len(postDate) + 2)]
        newspq = PyQuery(newsUrl)
        content = newspq("div.texttext").text()
        source = ""
        if not content:
            continue
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass
def zhongguohangkong():
    """Crawl Air China (中国航空) FFP news.

    The host intermittently raises ``urlopen error``; each fetch is
    retried up to 10 times with a 5-second pause.
    NOTE(review): if all 10 retries fail, ``pq``/``newspq`` is never
    bound and the next use raises NameError — confirm intended.
    """
    airport = "中国航空"
    newsPageUrl = "http://ffp.airchina.com.cn/cms/ffp/jszx/xwxx/default.html"
    conn, cursor = mysqlDataBase_init()
    # Retry the index page up to 10 times on URLError.
    count=0
    while count!=10:
        try:
            pq = PyQuery(newsPageUrl)
        except urllib2.URLError:
            count+=1
            time.sleep(5)
        else:
            break
    i = 0
    while True:
        newsUrl = pq("a:eq(%s)" % i).attr("href")
        if not newsUrl:
            break
        span = pq("span:eq(%s)" % i).text()
        i += 1
        postDate = re.search(r"[\d]{4}-[\d]{2}-[\d]{2}", span).group()
        if postDate < earliestTime:
            break
        # Span text is "<title><timestamp>"; the last 14 chars are the stamp.
        title = span[:-14]
        newsUrl = "http://ffp.airchina.com.cn" + newsUrl
        if checkNewsUrl(cursor, newsUrl):
            break
        # Retry the article page up to 10 times on URLError.
        count=0
        while count!=10:
            try:
                newspq = PyQuery(newsUrl)
            except urllib2.URLError:
                count+=1
                time.sleep(5)
            else:
                break
        content = newspq("body").text()
        source = ""
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass
def sichuanhangkong():
    """Crawl Sichuan Airlines (四川航空) news.

    Article ids are carried in each anchor's ``onclick`` handler and
    mapped onto static HTML pages under FileUpLoad/htmlpage/.
    """
    airport = "四川航空"
    newsPageUrl = "http://www.scal.com.cn/ScalB2CWeb/News/More_News.aspx?code=NC00391"
    pq = PyQuery(urllib.urlopen(newsPageUrl).read())
    ul = pq("ul.news_n")  # NOTE(review): unused; kept for fidelity
    conn, cursor = mysqlDataBase_init()
    # 4-digit article id inside the onclick attribute.
    match = re.compile(r"[\d]{4}")
    i = 0
    while True:
        a = pq("ul.news_n a:eq(%s)" % i)
        i += 1
        id = a.attr("onclick")
        if not id:
            break
        id = match.search(id).group()
        newsUrl = "http://www.scal.com.cn/Scal.WebMaster/FileUpLoad/htmlpage/%s.html"
        newsUrl = newsUrl % id
        if checkNewsUrl(cursor, newsUrl):
            break
        newspq = PyQuery(newsUrl)
        title = newspq("span.NewsTitle").text()
        postDate = newspq("span.NewsInfo_Time").text()
        # Normalize the Chinese date, then drop a leading label prefix
        # (first 5 chars) — TODO confirm the prefix length on live pages.
        postDate = parseDate(postDate)[5:]
        source = newspq("span.NewsInfo_Publisher").text()
        content = newspq("div.NewsContent").text()
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass
def tianjinhangkong():
    """Crawl Tianjin Airlines (天津航空) announcements.

    Articles are served as MIME-like documents: a ``Date:`` header is
    parsed for the post date, and the body after the ``X-MimeOLE`` header
    is base64-decoded into HTML before extraction.
    """
    airport = "天津航空"
    # Announcements from the official Tianjin Airlines site (AJAX endpoint).
    url = "http://www.tianjin-air.com/index/ann!doAjaxQuery.action"
    pq = PyQuery(urllib.urlopen(url).read())
    # The real article path is single-quoted inside the href attribute.
    match = re.compile("'.*'")
    conn, cursor = mysqlDataBase_init()
    i = 0
    while True:
        a = pq("a:eq(%s)" % i)
        i += 1
        title = a.attr("title")
        newsUrl = a.attr("href")
        if not newsUrl:
            break
        newsUrl = match.search(newsUrl).group()[1:-1]
        newsUrl = "http://www.tianjin-air.com" + newsUrl
        if checkNewsUrl(cursor, newsUrl):
            break
        content = urllib.urlopen(newsUrl).read()
        # Pull the "Date:" header line and trim label/suffix characters.
        begin = content.find("Date:")
        postDate = content[begin:content.find("\n", begin)][6:-6].strip()
        try:
            postDate = time.strftime("%Y-%m-%d", time.strptime(postDate, "%a, %m %b %Y %H:%M:%S"))
        except:
            # Header not in RFC-ish form: fall back to a bare date in the
            # string, else the crawl date.
            matchp=re.search(r"[\d]+-[\d]+-[\d]+", postDate)
            if matchp:
                postDate=matchp.group()
            else:
                postDate=time.strftime("%Y%m%d")
        # Body starts after the X-MimeOLE header line; it is base64-encoded.
        content = content[content.find("\n", content.find("X-MimeOLE")):].strip()
        content = base64.decodestring(content)
        try:
            newspq = PyQuery(content)
        except:
            # Decoded payload is not parseable HTML: skip this article.
            continue
        content = newspq("body").text()
        if not content:
            continue
        source=""
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass
def shandonghangkong():
    """Crawl Shandong Airlines (山东航空) news.

    The news list is loaded via an AJAX portlet, so this replays the
    browser's POST to ``/c/portal/render_portlet`` with captured headers
    and form data, then crawls the homepage news box as a second pass.
    NOTE(review): header values containing ':' (Referer, Origin, Method)
    are truncated by the naive split — confirm the server tolerates this.
    """
    airport = "山东航空"
    # Captured request headers; keep these lines flush-left — the string
    # content is split on '\n' and ':' at runtime.
    headerString = """Accept:*/*
Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3
Accept-Encoding:gzip,deflate,sdch
Accept-Language:zh-CN,zh;q=0.8
Connection:keep-alive
Content-Length:176
Content-Type:application/x-www-form-urlencoded
Cookie:JSESSIONID=0000Dva3t-EvL-J6jQ5uEa8YppU:-1; GUEST_LANGUAGE_ID=zh_CN; COOKIE_SUPPORT=true; __ozlvd671=1302764071; __ozlvd=1302764071
Host:www.shandongair.com
Method:POST /c/portal/render_portlet HTTP/1.1
Origin:http://www.shandongair.com
Referer:http://www.shandongair.com/web/shair_zh/news
User-Agent:Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16"""
    headerString = headerString.split("\n")
    # Captured portlet form fields (no strip() applied, so these lines
    # must stay flush-left too).
    dataString = """p_l_id:89978
p_p_id:journal_news_INSTANCE_1h76
p_p_action:0
p_p_state:normal
p_p_mode:view
p_p_col_id:column-3
p_p_col_pos:2
p_p_col_count:3
currentURL:/web/shair_zh/news"""
    dataString = dataString.split("\n")
    dataDict = {}
    for line in dataString:
        line = line.split(":")
        dataDict[line[0]] = line[1]

    url = "http://www.shandongair.com/c/portal/render_portlet"
    request = urllib2.Request(url)
    for line in headerString:
        line = line.strip().split(":")
        request.add_header(line[0], line[1])
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    page = 1
    while page != -1:
        # "cur" selects the portlet's result page.
        dataDict["cur"] = "%s" % page
        data = urllib.urlencode(dataDict)
        page += 1
        sourceFile = opener.open(request, data).read()
        conn, cursor = mysqlDataBase_init()
        pq = PyQuery(unicode(sourceFile, "utf8"))
        i = 0
        while True:
            a = pq("table.taglib-search-iterator a:eq(%s)" % i)
            span = pq("table.taglib-search-iterator span:eq(%s)" % i)
            i += 1
            newsUrl = a.attr("href")
            if not newsUrl:
                break
            newsUrl = "http://www.shandongair.com" + newsUrl
            if checkNewsUrl(cursor, newsUrl):
                page = -1  # already stored: stop paging
                break
            title = a.text()
            postDate = span.text()
            if postDate < earliestTime:
                page = -1  # older than cutoff: stop paging
                break
            sourceFile = urllib.urlopen(newsUrl).read()
            newspq = PyQuery(unicode(sourceFile, "utf8"))
            content = newspq("td.main").text()
            source = ""
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        conn.commit()
        conn.close()
        pass

    # --- Second pass: news linked from the Shandong Airlines homepage. ---
    newsPageUrl = "http://www.shandongair.com/"
    conn, cursor = mysqlDataBase_init()
    pq = PyQuery(newsPageUrl)
    i = 0
    while True:
        a = pq("div.ind_news_box_center a:eq(%s)" % i)
        i += 1
        newsUrl = a.attr("href")
        if not newsUrl:
            break
        if checkNewsUrl(cursor, newsUrl):
            break
        sourceFile = urllib.urlopen(newsUrl).read()
        sourceFile = unicode(sourceFile, "utf-8")
        newspq = PyQuery(sourceFile)
        content = newspq("body table:eq(1)").text()[:-6]
        title = newspq("title").text()
        # Prefer a Chinese-format date found in the article text.
        m = re.search(r"[\d]{4}年[\d]+月[\d]+日", content)
        if m:
            postDate = m.group()
            postDate = parseDate(postDate)
        else:
            postDate = time.strftime("%Y%m%d")
        source = ""
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass
def jixianghangkong():
    """Crawl Juneyao Airlines (吉祥航空) promotions.

    The site publishes no news section, so promotional notices are
    crawled instead.
    """
    airport = "吉祥航空"
    page = 1
    while page != -1:
        newsPageUrl = "http://www.juneyaoair.com/JY_Airlines_dtlist.jsp?page=%s&menu_no=203" % page
        page += 1
        try:
            pq = PyQuery(newsPageUrl)
        except:
            # Page failed to load: assume we ran past the last list page.
            break
        conn, cursor = mysqlDataBase_init()
        table = pq("table:eq(18)")
        # Row 0 is the header row; start at 1.
        trindex = 1
        while True:
            tr = table("tr:eq(%s)" % trindex)
            trindex += 1
            if not tr:
                break
            a = tr("td a")
            newsUrl = "http://www.juneyaoair.com/" + a.attr("href")
            title = a.text()
            source = tr("td:eq(3)").text()
            postDate = tr("td:eq(4)").text()
            if not postDate:
                break
            if postDate < earliestTime:
                page = -1  # older than cutoff: stop paging
                break
            if checkNewsUrl(cursor, newsUrl):
                page = -1  # already stored: stop paging
                break
            newspq = PyQuery(newsUrl)
            content = newspq("table:eq(6)").text()
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        conn.commit()
        conn.close()
        pass
    pass
def nanfanghangkong():
    """Crawl China Southern Airlines (南方航空) yearly news lists.

    Walks year-indexed list pages backwards from the current year.
    NOTE(review): known to still have issues (original author's remark);
    the early ``return`` paths skip conn.commit()/close(), so rows from
    the final partial year are never committed — confirm intended.
    """
    airport = "南方航空"
    year = int(time.strftime("%Y"))
    match = re.compile(r"[\d]{4}-[\d]{2}-[\d]{2}")
    conn, cursor = mysqlDataBase_init()
    # Lexicographic compare of 4-digit year strings works for same-length years.
    while str(year) >= earliestTime[:4]:
        newsBasePage = "http://www.csair.com/cn/aboutcsn/04/newsCompany/%s/list_news_%s.asp" % (year, year)
        sourceFile = urllib.urlopen(newsBasePage).read()
        pq = PyQuery(unicode(sourceFile, "utf8"))
        liindex = 0
        while True:
            li = pq("li:eq(%s)" % liindex)
            liindex += 1
            a = li("a")
            newsUrl = a.attr("href")
            if not newsUrl:
                break
            newsUrl = "http://www.csair.com" + newsUrl
            postDate = match.search(li.text()).group()
            if postDate < earliestTime:
                return
            if checkNewsUrl(cursor, newsUrl):
                return
            newspq = PyQuery(newsUrl)
            content = newspq("#content").text()
            # Third whitespace-separated token is assumed to be the title
            # — TODO confirm against live pages.
            title = content.split()[2]
            source = ""
            # Strip Dreamweaver template markers leaking into the text.
            if content.find("InstanceBeginEditable") != -1:
                content = content[len("InstanceBeginEditable name=\"con\" "):]
            if content.find("InstanceEndEditable") != -1:
                content = content[:-len("InstanceEndEditable")]
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        year -= 1
        pass
    conn.commit()
    conn.close()
    pass
def chunqiuhangkongwang():
    """Crawl Spring Airlines news from the china-sss info portal (GBK).

    The ``source`` column defaults to the portal name but is overridden
    per article when a "来源:" marker is found in the text, then reset.
    """
    airport = "春秋航空"
    source = "春秋航空网"
    newsBasePageUrl = "http://info.china-sss.com/article_ps.do?xsl=topic_first_ilist_e&pt=1&ps=30&pi=%s&sf=seq_num&sfdt=number&st=ascending&cid=8069761218&bid=1&qtopic=3&qtit=&qct1=&qct2=#"
    conn, cursor = mysqlDataBase_init()
    # "来源:<name>[" — the article's own source marker.
    sourceMatch = re.compile(r"来源:.*\[")
    page = 1
    while page != -1:
        newsPageUrl = newsBasePageUrl % page
        page += 1
        sourceFile = urllib.urlopen(newsPageUrl).read()
        pq = PyQuery(unicode(sourceFile, "gbk"))
        i = 0
        while True:
            a = pq("a:eq(%s)" % i)
            tr = pq("td#container1 tr:eq(%s)" % i)
            i += 1
            newsUrl = a.attr("href")
            if not newsUrl:
                break
            # Single-character hrefs ("#") are navigation stubs; skip them.
            if len(newsUrl) == 1:
                continue
            title = a.text()
            newsUrl = "http://info.china-sss.com" + newsUrl
            # The list shows a 2-digit year; prefix the century.
            postDate = "20" + tr("td:eq(2)").text()
            if postDate < earliestTime:
                page = -1
                break
            if checkNewsUrl(cursor, newsUrl):
                page = -1
                break
            sourceFile = urllib.urlopen(newsUrl).read()
            newspq = PyQuery(unicode(sourceFile, "gbk"))
            content = newspq("td.fontmain1").text()
            # Cut off embedded JavaScript trailing the article body.
            if content.find("function showContent()") != -1:
                content = content[:content.find("function showContent()")]
            sourceInContent = sourceMatch.search(content)
            if sourceInContent:
                source = sourceInContent.group()[len("来源:"):-1]
                if source.find("】") != -1:
                    source = source[:-len("】")]
            # Strip forum-style link markup.
            content = content.replace("[/链接]", "")
            if content.find("[链接 地址=") != -1:
                content = content[:-content.find("[链接 地址=")]
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            # Restore the default source for the next article.
            source = "春秋航空网"
            pass
        pass
    conn.commit()
    conn.close()
    pass

def minhangziyuanwang():
    """Crawl news about several airlines from carnoc.com (民航资源网).

    Searches the portal for each airline name (GB2312 query encoding)
    and stores matching articles, skipping very short bodies.
    """
    source = "民航资源网"
    matchPostDate = re.compile(r"[\d]+-[\d]+-[\d]+")
    conn, cursor = mysqlDataBase_init()
    # Airlines to search the portal for.
    au = ["天津航空", "吉祥航空", "厦门航空", "四川航空", "中国航空"]
    for airport in au:
        url = "http://news.carnoc.com/search.jsp?key=%s&querytype=0&page=1"
        # The portal expects the query keyword in GB2312.
        url = url % (urllib.quote(airport.decode("utf8").encode("gb2312")))
        sourceFile = urllib.urlopen(url).read()
        pq = PyQuery(sourceFile)
        index = 0
        while True:
            li = pq("div.text li:eq(%s)" % index)
            index += 1
            if not li:
                break
            newsUrl = li("a").attr("href")
            title = li("a").text()
            match = matchPostDate.search(li.text())
            if not match:
                continue
            postDate = match.group()
            # Skip URLs containing "2000" — presumably archive/ad links;
            # TODO confirm the intent.
            if newsUrl.find("2000") != -1:
                continue
            if checkNewsUrl(cursor, newsUrl):
                break
            sourceFile = urllib.urlopen(newsUrl).read()
            newspq = PyQuery(sourceFile)
            content = newspq("div#newstext").text()
            if not content:
                continue
            # Bodies under 100 chars are likely stubs; skip them.
            if len(content) < 100:
                continue
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
    # Restore the default (empty) source.
    source = ""
    conn.commit()
    conn.close()
    pass
def crawl_news():
    """Run every crawler in its own thread and print the elapsed time.

    Discovers crawler functions by name (anything containing "hangkong")
    on the ``jipiao`` module, plus the carnoc.com portal crawler.  Uses
    ``getattr`` instead of the original ``exec`` string — same attribute
    lookup, no dynamic code execution.
    """
    startTime = time.time()
    # Deployed module name used by the final version of this script.
    import jipiao
    threads = [threading.Thread(target=minhangziyuanwang)]
    for f in dir(jipiao):
        if f.find("hangkong") != -1:
            threads.append(threading.Thread(target=getattr(jipiao, f)))
    # Start all crawlers ...
    for t in threads:
        t.start()
    # ... and wait for them to finish.
    for t in threads:
        t.join()
    printf((time.time() - startTime))
if __name__ == "__main__":
    startTime = time.time()
    crawl_news()
    # Append a run log entry (creating the logs directory on first run).
    if not os.path.exists("logs"):
        os.mkdir("logs")
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    # open() + with replaces the Python-2-only file() and guarantees the
    # handle is closed even if the write fails.
    with open("./logs/logs.txt", "a") as logs:
        logs.write("%s\t耗时:%sms\n" % (now, time.time() - startTime))
    printf(time.time() - startTime)

创建数据表的 SQL:
CREATE TABLE IF NOT EXISTS `jipiaonews` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`title` text NOT NULL,
`postDate` datetime NOT NULL,
`content` text NOT NULL,
`airport` varchar(255) NOT NULL,
`source` varchar(255) NOT NULL,
`newsUrl` varchar(255) NOT NULL DEFAULT '',
PRIMARY KEY (`id`)
) ENGINE=MyISAM  DEFAULT CHARSET=utf8;

运维网声明 1、欢迎大家加入本站运维交流群:群②:261659950 群⑤:202807635 群⑦870801961 群⑧679858003
2、本站所有主题由该帖子作者发表,该帖子作者与运维网享有帖子相关版权
3、所有作品的著作权均归原作者享有,请您和我们一样尊重他人的著作权等合法权益。如果您对作品感到满意,请购买正版
4、禁止制作、复制、发布和传播具有反动、淫秽、色情、暴力、凶杀等内容的信息,一经发现立即删除。若您因此触犯法律,一切后果自负,我们对此不承担任何责任
5、所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其内容的准确性、可靠性、正当性、安全性、合法性等负责,亦不承担任何法律责任
6、所有作品仅供您个人学习、研究或欣赏,不得用于商业或者其他用途,否则,一切后果均由您自己承担,我们对此不承担任何法律责任
7、如涉及侵犯版权等问题,请您及时通知我们,我们将立即采取措施予以解决
8、联系人Email:admin@iyunv.com 网址:www.yunweiku.com

所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其承担任何法律责任,如涉及侵犯版权等问题,请您及时通知我们,我们将立即处理,联系人Email:kefu@iyunv.com,QQ:1061981298 本贴地址:https://www.yunweiku.com/thread-372967-1-1.html 上篇帖子: python JSON string 的中文问题 下篇帖子: google app engine python 汉字编码问题
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

扫码加入运维网微信交流群X

扫码加入运维网微信交流群

扫描二维码加入运维网微信交流群,最新一手资源尽在官方微信交流群!快快加入我们吧...

扫描微信二维码查看详情

客服E-mail:kefu@iyunv.com 客服QQ:1061981298


QQ群⑦:运维网交流群⑦ QQ群⑧:运维网交流群⑧ k8s群:运维网kubernetes交流群


提醒:禁止发布任何违反国家法律、法规的言论与图片等内容;本站内容均来自个人观点与网络等信息,非本站认同之观点.


本站大部分资源是网友从网上搜集分享而来,其版权均归原作者及其网站所有,我们尊重他人的合法权益,如有内容侵犯您的合法权益,请及时与我们联系进行核实删除!



合作伙伴: 青云cloud

快速回复 返回顶部 返回列表