用python通过apache log 获取百度搜索来源关键词

tilac · 发表于 2015-8-3 08:19:57

　　apache log格式

127.0.0.1 - - [24/Feb/2011:19:20:27 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=44200 HTTP/1.1" 200 88
127.0.0.1 - - [24/Feb/2011:19:20:28 +0800] "GET /seo/index2.php HTTP/1.1" 200 1228
127.0.0.1 - - [24/Feb/2011:19:20:28 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index2.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=85596 HTTP/1.1" 200 88
127.0.0.1 - - [24/Feb/2011:19:20:29 +0800] "GET /seo/index.php HTTP/1.1" 200 844
127.0.0.1 - - [24/Feb/2011:19:20:29 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=88069 HTTP/1.1" 200 88
127.0.0.1 - - [24/Feb/2011:19:20:30 +0800] "GET /seo/index2.php HTTP/1.1" 200 1228
127.0.0.1 - - [24/Feb/2011:19:20:30 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index2.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=65456 HTTP/1.1" 200 88
127.0.0.1 - - [24/Feb/2011:19:20:31 +0800] "GET /seo/index.php HTTP/1.1" 200 844
127.0.0.1 - - [24/Feb/2011:19:20:31 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=91624 HTTP/1.1" 200 88
127.0.0.1 - - [24/Feb/2011:19:20:31 +0800] "GET /seo/index2.php HTTP/1.1" 200 1228
127.0.0.1 - - [24/Feb/2011:19:20:31 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index2.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=68220 HTTP/1.1" 200 88
127.0.0.1 - - [24/Feb/2011:19:20:32 +0800] "GET /seo/index.php HTTP/1.1" 200 844
127.0.0.1 - - [24/Feb/2011:19:20:32 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=37909 HTTP/1.1" 200 88
127.0.0.1 - - [24/Feb/2011:19:20:32 +0800] "GET /seo/index2.php HTTP/1.1" 200 1228
127.0.0.1 - - [24/Feb/2011:19:20:32 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index2.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=53594 HTTP/1.1" 200 88
127.0.0.1 - - [24/Feb/2011:19:20:33 +0800] "GET /seo/index.php HTTP/1.1" 200 844
127.0.0.1 - - [24/Feb/2011:19:20:33 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=32830 HTTP/1.1" 200 88
　　python代码

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os, base64, re, fnmatch, imghdr, shutil, pprint, urlparse
log = "seo.log"
contents = []

# Captures the request URI of a hit on the /seo/t.php tracking endpoint.
# Compiled once here instead of once per log line.
REQUEST_RE = re.compile(r'.*"GET (/seo/t\.php\?.*) HTTP/1\.1".*', re.IGNORECASE)


def keyword_from_request(request):
    """Return the ``wd`` keyword carried by a tracking request, or None.

    ``request`` is the request URI captured by ``REQUEST_RE``. Its ``ref``
    query parameter holds the URL-encoded referrer; the keyword is the
    first ``wd`` parameter of that referrer's own query string.

    None is returned when the request has no ``ref`` parameter or the
    referrer has no ``wd`` parameter, so one odd log line cannot abort
    the whole run with a KeyError.
    """
    params = urlparse.parse_qs(urlparse.urlparse(request).query, True)
    referrers = params.get('ref')
    if not referrers:
        return None
    referrer = urlparse.urlparse(referrers[0])
    keywords = urlparse.parse_qs(referrer.query, True).get('wd')
    if not keywords:
        return None
    return keywords[0]


def main():
    """Print the keyword of every tracking hit found in ``log``.

    Lines that are not tracking hits are collected in ``contents``.
    """
    # The context manager closes the file even if parsing raises.
    with open(log, 'r') as reader:
        for line in reader:
            m = REQUEST_RE.match(line)
            if m is None:
                contents.append(line)
                continue
            keyword = keyword_from_request(m.group(1))
            if keyword is not None:
                # Single-argument print() behaves the same on Python 2 and 3.
                print(keyword)


if __name__ == "__main__":
    main()
　　如果要同时统计来自 Google、Soso、Baidu 的搜索关键词，可以使用下面的代码

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os, base64, re, fnmatch, imghdr, shutil, pprint, urlparse
log = "seo.log"

# One entry per supported search engine: 'h' is the referrer host and 'q'
# is a '|'-separated list of query parameters that may hold the keyword.
config =  {'s0':{'h':'www.google.com.hk','q':'q'},'s1':{'h':'www.baidu.com','q':'wd|word'},'s3':{'h':'www.soso.com','q':'w'}}

# Captures the request URI of a hit on the /seo/t.php tracking endpoint.
# Compiled once here instead of once per log line.
HIT_RE = re.compile(r'.*"GET (/seo/t\.php\?.*) HTTP/1\.1".*', re.IGNORECASE)


def get_q(x):
    """Return the keyword parameter names for the search engine of ``x``.

    ``x`` is a parsed URL; its ``netloc`` is compared with each configured
    host. Returns a list of parameter names, or None when the host is not
    listed in ``config``.
    """
    for engine in config.values():
        if x.netloc == engine['h']:
            return engine['q'].split('|')
    return None


def search_hits(request):
    """Return ``(host, keyword)`` pairs found in a tracking request.

    ``request`` is the request URI captured by ``HIT_RE``. Its ``ref``
    query parameter holds the URL-encoded referrer. One pair is produced
    for each configured keyword parameter present in the referrer's query
    string. The list is empty when there is no ``ref`` parameter or the
    referrer host is not a configured search engine.
    """
    params = urlparse.parse_qs(urlparse.urlparse(request).query, True)
    referrers = params.get('ref')
    if not referrers:
        return []
    ref = urlparse.urlparse(referrers[0])
    ref_qs = urlparse.parse_qs(ref.query, True)
    # get_q() yields None for unknown hosts; treat that as "no parameters"
    # instead of failing when iterating over None.
    names = get_q(ref) or ()
    return [(ref.netloc, ref_qs[k][0]) for k in names if k in ref_qs]


def main():
    """Print ``host:::keyword`` for every search-engine hit in ``log``."""
    # The context manager closes the file even if parsing raises.
    with open(log, 'r') as reader:
        for line in reader:
            m = HIT_RE.match(line)
            if m is None:
                continue
            for host, keyword in search_hits(m.group(1)):
                # Single-argument print() behaves the same on Python 2 and 3.
                print(host + ":::" + keyword)


if __name__ == "__main__":
    main()
　　结果如下

---------- Python ----------
www.baidu.com:::php referer
www.baidu.com:::php referer
www.baidu.com:::php referer
www.baidu.com:::php referer
www.baidu.com:::php referer
www.baidu.com:::php referer
www.soso.com:::js urlencode
www.baidu.com:::php referer
www.google.com.hk:::urldecode js
输出完毕 (耗时 0 秒) - 正常终止

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

Red Hat RHCE 8 (EX294) Cert Guide

c++ size_t 和 int 的区别

HERE 使用 AWS EF 和 JFrog Artifactory 打

C++ 指针大全：从基础到进阶，一篇快速上手

wirelessnetview好用的无线分析工具

亿图图示专家(EDraw Max) V7.9 中文破解版

[经验分享] 用python通过apache log 获取百度搜索来源关键词

浏览过的版块

扫码加入运维网微信交流群