shenzhang 发表于 2015-12-2 09:22:52

初识python之 APP store排行榜 蜘蛛抓取(三)

#coding=utf-8
import time
import urllib2
import Queue
import threading
import xml.dom.minidom
import MySQLdb

# Fetch the feed data
def get_appstop_data(url):
    """Download *url* and return the raw response body.

    The request is sent with a fixed browser-like User-Agent header.

    Args:
        url: address of the feed to download.

    Returns:
        The response body as returned by ``read()``.

    Raises:
        urllib2.HTTPError: when the server answers with an HTTP error
            status. The error body is printed before the exception is
            re-raised.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, headers=headers)
    # Issue the request exactly once; the previous version opened the URL
    # twice and only the second call was covered by the error handler.
    try:
        response = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        if e.fp is not None:
            print(e.fp.read())
        # There is no body to return, so let the caller see the failure.
        raise
    try:
        return response.read()
    finally:
        response.close()

def get_attrvalue(node, attrname):
    """Return attribute *attrname* of *node*, or '' when *node* is falsy."""
    if node:
        return node.getAttribute(attrname)
    return ''


def get_xmlnode(node, name):
    """Return the descendants of *node* named *name*, or [] when *node* is falsy."""
    if node:
        return node.getElementsByTagName(name)
    return []


def get_nodevalue(node, index=0):
    """Return the ``nodeValue`` of child number *index* of *node*.

    Returns '' when *node* is falsy, when it has no child at *index*, or
    when that child carries no value (for example an element child).
    """
    if not node:
        return ''
    try:
        child = node.childNodes[index]
    except IndexError:
        return ''
    value = child.nodeValue
    if value is None:
        return ''
    return value


def _first_node(parent, name):
    """Return the first descendant of *parent* named *name*, or None if absent."""
    matches = get_xmlnode(parent, name)
    if matches:
        return matches[0]
    return None


# Parse the XML feed
def get_xml_data(xml_string, table, type, itunesid=0):
    """Parse a feed document into a list of per-app dictionaries.

    Every ``entry`` element yields one dictionary. Text and attribute
    values are UTF-8 encoded; a missing element or attribute yields an
    empty value instead of raising.

    Args:
        xml_string: the XML document to parse.
        table: stored unchanged under the ``'table'`` key of every record.
        type: stored unchanged under the ``'type'`` key of every record.
        itunesid: stored unchanged under the ``'itunesid'`` key.

    Returns:
        A list of dicts with the keys ``rank`` (1-based position of the
        entry in the document), ``title``, ``name``, ``url``, ``id``,
        ``category_id``, ``category_name``, ``price``, ``logo``, ``table``,
        ``type`` and ``itunesid``.
    """
    doc = xml.dom.minidom.parseString(xml_string)
    app_list = []
    for index, entry in enumerate(doc.getElementsByTagName('entry')):

        def text(tag, entry=entry):
            # Text content of the first <tag> element inside this entry.
            node = _first_node(entry, tag)
            return get_nodevalue(node).encode('utf-8', 'ignore')

        def attr(tag, attrname, entry=entry):
            # Attribute of the first <tag> element inside this entry.
            node = _first_node(entry, tag)
            return get_attrvalue(node, attrname).encode('utf-8', 'ignore')

        app_list.append({
            'rank': index + 1,
            # Application title
            'title': text('title'),
            # Application name
            'name': text('im:name'),
            # The <id> element's text is the App Store URL ...
            'url': text('id'),
            # ... and its im:id attribute is the application id.
            'id': attr('id', 'im:id'),
            # Category id and display name
            'category_id': attr('category', 'im:id'),
            'category_name': attr('category', 'label'),
            # Price amount
            'price': attr('im:price', 'amount'),
            # Logo: the first <im:image> element is used.
            'logo': text('im:image'),
            'table': table,
            'type': type,
            'itunesid': itunesid,
        })
    return app_list

# Build the crawl targets
def marge_url():
    """Build the list of feed URLs to crawl.

    For every country and every chart, one job is produced for the overall
    chart followed by one job per genre.

    Returns:
        A list of ``[url, country, chart_id, genre]`` lists, where
        ``chart_id`` is the number paired with the chart name below and
        ``genre`` is ``0`` for the overall chart or the genre id string.
    """
    # Countries
    countries = ['cn', 'tw', 'hk', 'us']
    # Charts: [feed name, numeric id]
    charts = [['toppaidapplications', 1], ['topfreeapplications', 2], ['topgrossingapplications', 3], ['toppaidipadapplications', 4], ['topfreeipadapplications', 5], ['topgrossingipadapplications', 6]]
    # Genres
    genres = ['6000', '6001', '6002', '6003', '6004', '6005', '6006', '6007', '6008', '6009', '6010', '6011', '6012', '6013', '6014', '6015', '6016', '6017', '6018', '6020', '6022', '6023']

    url_list = []
    for country in countries:
        # Each chart is a [name, id] pair: the name goes into the URL and
        # the id into the job record. Joining the pair itself into the URL
        # raised TypeError.
        for chart_name, chart_id in charts:
            url = ['http://itunes.apple.com/', country, '/rss/', chart_name, '/limit=200/xml']
            url_list.append([''.join(url), country, chart_id, 0])
            for genre in genres:
                url = ['http://itunes.apple.com/', country, '/rss/', chart_name, '/limit=200/genre=', genre, '/xml']
                url_list.append([''.join(url), country, chart_id, genre])
    return url_list
# Execute a SQL statement
def DbSql(sql, type):
    """Run one SQL statement on a fresh MySQL connection.

    Args:
        sql: the statement to execute; a falsy value is a no-op.
        type: selects the result: ``'select'`` returns the first row from
            ``fetchone()`` (``None`` when there is no row), ``'add'``
            returns the connection's insert id as an int, and any other
            value returns the string ``'nokey'``.

    Returns:
        The value described above, or ``None`` when *sql* is falsy or a
        ``MySQLdb.Error`` occurred. Errors are printed and appended to
        ``PythonLogo.log``.
    """
    if not sql:
        return None
    conn = None
    try:
        conn = MySQLdb.connect(host="主机", user="用户", passwd="密码", db="数据库", charset='utf8')
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            if type == 'select':
                data = cursor.fetchone()
            elif type == 'add':
                data = int(conn.insert_id())
            else:
                data = 'nokey'
        finally:
            cursor.close()
        conn.commit()
        return data
    except MySQLdb.Error as e:
        # e.args is normally (error code, message); formatting the whole
        # tuple with %d raised TypeError inside this handler.
        if len(e.args) >= 2:
            errMsg = "Mysql Error %d: %s" % (e.args[0], e.args[1])
        else:
            errMsg = "Mysql Error: %s" % (e,)
        print(errMsg)
        # Append so that earlier errors are not wiped by later ones.
        with open('PythonLogo.log', 'a') as file_object:
            file_object.write(errMsg + '\n')
        return None
    finally:
        # Release the connection on the error path as well.
        if conn is not None:
            conn.close()

def _sql_quote(value):
    """Escape backslashes and quotes in *value* for use inside a quoted SQL literal."""
    # Backslashes first, so the escapes added for the quotes are not doubled.
    return value.replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"')


def _save_app_rank(item, startTime):
    """Ensure the app row for *item* exists, then record its current rank."""
    table = item.get('table')
    apple_id = item.get('id')
    # The id is interpolated unquoted below, so only a purely numeric value
    # can form a valid statement.
    if not apple_id or not apple_id.isdigit():
        print('skip app without numeric id: %r' % (apple_id,))
        return

    sql = "SELECT app_id FROM app1_info_" + table + " WHERE app_id_apple=" + apple_id
    res = DbSql(sql, 'select')
    if res is None:
        addSql = "INSERT INTO app1_info_" + table + "(`app_id_apple`,`app_type`,`app_name`,`app_itunesid`,`collect_country`,`app_logo`,`app_price`) VALUES(%s,'%s','%s','%s','%s','%s','%s')" % (
            apple_id,
            item.get('type'),
            _sql_quote(item.get('name')),
            _sql_quote(item.get('category_id')),
            table,
            _sql_quote(item.get('logo')),
            _sql_quote(item.get('price')),
        )
        key_id = DbSql(addSql, 'add')
        print('insert')
    else:
        print('select')
        # DbSql returns the whole row for 'select'; the id is its only column.
        key_id = res[0]
    print(key_id)
    if key_id is None:
        # The insert failed (DbSql already logged it); there is no row to
        # attach a rank to.
        return

    # Look up the previously recorded rank
    prevSql = "SELECT collect_now_rank FROM app1_collect_%s WHERE collect_app_id=%s AND collect_country='%s' AND collect_app_type=%s AND collect_app_itunesid=%s ORDER BY collect_ctime desc" % (table, key_id, table, item.get('type'), item.get('itunesid'))
    prevRes = DbSql(prevSql, 'select')
    if prevRes is None:
        prev_rank = 0
    else:
        prev_rank = prevRes[0]

    collectSql = "INSERT INTO app1_collect_" + table + "(`collect_app_id`,`collect_ctime`,`collect_now_rank`,`collect_prev_rank`,`collect_app_type`,`collect_app_itunesid`,`collect_country`) VALUES(%s,'%s','%s','%s','%s','%s','%s')" % (key_id, startTime, item.get('rank'), prev_rank, item.get('type'), item.get('itunesid'), table)
    DbSql(collectSql, 'addnokey')


def addAppRank(app_list, startTime):
    """Store every app in *app_list* together with its current rank.

    Args:
        app_list: dictionaries providing the keys ``table``, ``id``,
            ``type``, ``name``, ``category_id``, ``logo``, ``price``,
            ``rank`` and ``itunesid``.
        startTime: value written to ``collect_ctime`` for every rank row.
    """
    for item in app_list:
        _save_app_rank(item, startTime)
        # Pause 0.2 seconds.
        # NOTE(review): the pasted source lost its indentation; the pause is
        # assumed to apply after every item - confirm.
        time.sleep(0.2)

# q is the job queue shared by all worker threads
# NUM is the total number of concurrent worker threads
# JOBS holds the jobs returned by marge_url()
q = Queue.Queue()
NUM = 50
JOBS = marge_url()
# Integer Unix timestamp taken once when the module is loaded.
startTime = int(time.time())
# Worker threads; filled in by the __main__ section.
threads = []
# Lock acquired by MyThread.run around the processing of each job.
mutex = threading.Lock()
class MyThread(threading.Thread):
    """Worker thread that takes jobs from a shared queue until it is empty.

    Each job is a sequence whose items 0-3 are passed on as the URL to
    fetch and the ``table``, ``type`` and ``itunesid`` arguments of
    ``get_xml_data``.
    """

    def __init__(self, work_queue, startTime):
        threading.Thread.__init__(self)
        self.work_queue = work_queue
        # Keep the timestamp this worker was created with rather than
        # silently relying on the module-level value.
        self.startTime = startTime

    def run(self):
        while True:
            # Checking qsize() and then calling a blocking get() is a race:
            # another worker can take the last job in between and leave
            # this thread blocked forever. Take without blocking instead.
            try:
                arguments = self.work_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                # "with" releases the lock even when the job raises, so one
                # failure cannot block every other worker.
                with mutex:
                    # Fetch the feed
                    app_string = get_appstop_data(arguments[0])
                    # Parse it
                    app_arr = get_xml_data(app_string, arguments[1], arguments[2], arguments[3])
                    # Store the result in the database
                    addAppRank(app_arr, self.startTime)
            except Exception as e:
                # A failing job must not end the worker; report it and move on.
                print('job failed %r: %s' % (arguments, e))
            time.sleep(1)

if __name__ == '__main__':
    # Queue every job before any worker starts.
    for job in JOBS:
        q.put(job)
    # Create NUM workers that share the queue.
    threads.extend(MyThread(q, startTime) for _ in range(NUM))
    for worker in threads:
        worker.setDaemon(True)
        worker.start()
    # Wait until every worker has finished.
    for worker in threads:
        worker.join()
    print('主线程运行结束')


  cnblogs不能上传文件, 需要源文件的可以 联系我!QQ:516317457
页: [1]
查看完整版本: 初识python之 APP store排行榜 蜘蛛抓取(三)