import os, base64, re, fnmatch, imghdr, shutil, pprint, urlparse
# Scan seo.log for requests to /seo/t.php, take the `ref` query parameter of
# each such request, parse it as a URL and print that URL's `wd` parameter.
log = "seo.log"

# Log lines that are not /seo/t.php hits.
# NOTE(review): the original indentation was lost; the `else` branch is
# assumed to pair with `if m` (i.e. one append per non-matching line).
contents = []

# Compiled once up front instead of once per log line. The raw string yields
# exactly the same pattern text as the original non-raw literal.
p = re.compile(r'.*"GET (\/seo\/t\.php\?.*) HTTP\/1\.1".*', re.IGNORECASE)

# `with` guarantees the log is closed even if a line fails to parse.
with open(log, 'r') as reader:
    for line in reader:
        m = p.match(line)
        if not m:
            contents.append(line)
            continue

        res_file = m.group(1)
        cs = urlparse.urlparse(res_file)
        # keep_blank_values=True so empty parameters are still reported.
        s_q = urlparse.parse_qs(cs.query, True)
        if 'ref' not in s_q:
            # Hit without a referrer parameter: nothing to extract.
            continue

        ref = urlparse.urlparse(str(s_q['ref'][0]))
        ref_wd = urlparse.parse_qs(ref.query, True)
        # Referrers without a `wd` parameter are skipped instead of
        # aborting the whole scan with a KeyError.
        if 'wd' in ref_wd:
            print(ref_wd['wd'][0])
如果要同时统计来自 Google、Soso、Baidu 的搜索关键词,可以使用下面的脚本:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os, base64, re, fnmatch, imghdr, shutil, pprint, urlparse
# Path of the access log this script scans.
log = "seo.log"
# Read-only handle on the log ('r' is open()'s default mode); it is iterated
# and closed further down in the script.
reader = open(log)
# Search engines recognised in referrer URLs. For each entry, 'h' is the
# referrer host to match and 'q' is the '|'-separated list of query
# parameter names looked up in that referrer's query string.
config = {
    's0': {'h': 'www.google.com.hk', 'q': 'q'},
    's1': {'h': 'www.baidu.com', 'q': 'wd|word'},
    's3': {'h': 'www.soso.com', 'q': 'w'},
}


def get_q(x):
    """Return the query parameter names configured for the host of *x*.

    x -- an object with a ``netloc`` attribute (e.g. the result of
         ``urlparse.urlparse``); ``netloc`` is compared for exact equality
         against each configured 'h' value.

    Returns a list of parameter names (the entry's 'q' value split on '|'),
    or an empty list when the host is not in ``config``. The original fell
    off the end and returned None in that case, which made callers that
    iterate over the result raise TypeError.
    """
    for engine in config.values():
        if x.netloc == engine['h']:
            # Split only for the matching entry rather than for every entry.
            return engine['q'].split('|')
    return []
# For every /seo/t.php request in the log, parse its `ref` query parameter as
# a URL and print "<referrer host>:::<value>" for each parameter name that
# get_q() returns for that referrer and that is present in its query string.

# Compiled once up front instead of once per log line. The raw string yields
# exactly the same pattern text as the original non-raw literal.
p = re.compile(r'.*"GET (\/seo\/t\.php\?.*) HTTP\/1\.1".*', re.IGNORECASE)

try:
    for line in reader:
        m = p.match(line)
        if not m:
            continue

        s_t = m.group(1)
        s_t_u = urlparse.urlparse(s_t)
        # keep_blank_values=True so empty parameters are still reported.
        s_t_u_qs = urlparse.parse_qs(s_t_u.query, True)
        if 'ref' not in s_t_u_qs:
            # Hit without a referrer parameter: nothing to extract.
            continue

        ref = urlparse.urlparse(str(s_t_u_qs['ref'][0]))
        ref_qs = urlparse.parse_qs(ref.query, True)

        # get_q() may hand back None for a referrer host it does not know;
        # `or ()` turns that into "no parameters" instead of a TypeError.
        for k in get_q(ref) or ():
            if k in ref_qs:
                print(ref.netloc + ":::" + ref_qs[k][0])
finally:
    # Close the log even when a line raises part-way through the scan.
    reader.close()
结果如下