- Host OS: Ubuntu 13.04
- Python version: 2.7.4
- Django version: 1.5.4
- Scrapy version: 0.18.2
- ElasticSearch version: 0.90.5
If you repost this, please credit http://blog.yanming8.cn/archives/138. With some free time on my hands, I read up on the basics of search engines. After collecting material, I learned which basic subsystems a search engine needs: a crawling subsystem, an indexing service subsystem, and a web request/response subsystem. After working through the documentation of the relevant open-source frameworks, I put together an integrated project, which has been pushed to GitHub.
Let's start with the crawler, written on top of the open-source Scrapy framework. It crawls the campus network (mainly because that traffic is free of data charges):
    import re
    import urllib

    #from urlparse import urljoin
    from scrapy.utils.url import urljoin_rfc
    from scrapy.spider import BaseSpider
    from scrapy.selector import HtmlXPathSelector
    from scrapy.http import Request
    from scrapy.exceptions import DropItem

    from mymodules.items import Website


    class Xidian_Spider(BaseSpider):
        name = "xidian_spider"
        start_urls = [
            "http://www.xidian.edu.cn",
            #"http://rs.xidian.edu.cn/forum.php",
        ]

        def __init__(self):
            """Init the allowed_domains."""
            self.allowed_domains = ['xidian.edu.cn']

        def parse(self, response):
            """This parse() uses a double yield to return either an item or a Request."""
            hxs = HtmlXPathSelector(response)

            # Collect every href on the page as a candidate link to follow
            refer_websites = hxs.select('//@href').extract()

            #if not self.gethostname(response.url) in self.allowed_domains:
            #    self.allowed_domains.append(self.gethostname(response.url))

            item = Website()
            item['url'] = response.url
            item['title'] = hxs.select('/html/head/title/text()').extract()[0]

            # FIXME: this XPath selects all text nodes, including javascript code. BAD!!
            text_list = hxs.select('/html/body//*/text()').extract()
            item['content'] = ' '.join(text_list)
            yield item

            for weburl in refer_websites:
                utf8_url = weburl.encode('utf-8')

                # The following regexes match unwanted url postfixes (static/binary files)
                # and prefixes (javascript: and openapi links), which are skipped
                postfix = re.compile(r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
                prefix = re.compile(r'^((javascript:)|(openapi)).+')

                if postfix.match(utf8_url):
                    continue
                if prefix.match(utf8_url):
                    continue
                if not utf8_url.startswith('http://'):
                    #weburl = urljoin_rfc(response.url, weburl, response.encoding)
                    weburl = 'http://' + self.gethostname(response.url) + '/' + weburl
                weburl = re.sub(r'/\.\./\.\./', r'/', weburl)
                weburl = re.sub(r'/\.\./', r'/', weburl)

                yield Request(weburl, callback=self.parse)

        def gethostname(self, res_url):
            """Get the host name of a url."""
            proto, rest = urllib.splittype(res_url)
            host, rest = urllib.splithost(rest)
            return host
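The spider fills in a Website item imported from mymodules.items. That file isn't shown in this post, but judging from the fields assigned above, a minimal sketch of it would be:

    # mymodules/items.py (sketch -- reconstructed from the fields the spider assigns)
    from scrapy.item import Item, Field


    class Website(Item):
        url = Field()
        title = Field()
        content = Field()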
The crawled items are handed to the pipeline for processing.
The pipeline performs de-duplication. The set of visited URLs can't simply be kept in an in-memory set, so a Bloom filter is used; here I just installed pybloomfilter from the open-source Python packages (worth a closer look when I have time).
    from pybloomfilter import BloomFilter
    from scrapy.exceptions import DropItem
    # SearchIndex (defined below) builds the ElasticSearch index;
    # the import path is assumed -- adjust it to the actual project layout
    from mymodules.searchindex import SearchIndex


    class DuplicatesPipeline(object):
        def __init__(self):
            # 10M capacity, 1% error rate, persisted to the file 'filter.bloom'
            self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
            self.f_write = open('visitedsites', 'w')
            self.si = SearchIndex()

        def process_item(self, item, spider):
            print '************%d pages visited!*****************' % len(self.bf)
            if self.bf.add(item['url']):  # add() returns True if the url was already in the filter
                raise DropItem("Duplicate item found: %s" % item)
            else:
                #print '%d pages visited!' % len(self.url_seen)
                self.save_to_file(item['url'], item['title'])
                self.si.AddIndex(item)
                return item

        def save_to_file(self, url, utitle):
            self.f_write.write(url)
            self.f_write.write('\t')
            self.f_write.write(utitle.encode('utf-8'))
            self.f_write.write('\n')

        def __del__(self):
            """Close the visited-sites log on shutdown."""
            self.f_write.close()
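As a quick illustration of why bf.add() doubles as the duplicate check, here is a tiny standalone sketch of the pybloomfilter behaviour the pipeline relies on (the file name is only an example):

    from pybloomfilter import BloomFilter

    bf = BloomFilter(1000, 0.01, 'demo.bloom')   # capacity, error rate, backing file
    print bf.add('http://www.xidian.edu.cn')     # False: first time this url is seen
    print bf.add('http://www.xidian.edu.cn')     # True: already present, so the item would be dropped
    print len(bf)                                # approximate number of urls stored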
The SearchIndex used in that pipeline is the class that builds the ElasticSearch index. It is defined as follows:
    from pyes import ES

    from mymodules.items import Website

    INDEX_NAME = 'xidian_spider'


    class SearchIndex(object):
        def __init__(self):
            self.conn = ES('127.0.0.1:9200', timeout=3.5)  # Connect to ES
            try:
                self.conn.delete_index(INDEX_NAME)  # Drop any index left over from a previous run
            except Exception:
                pass
            self.conn.create_index(INDEX_NAME)  # Create a new INDEX

            # Define the structure of the data format
            mapping = {u'content': {'boost': 1.0,
                                    'index': 'analyzed',
                                    'store': 'yes',
                                    'type': u'string',
                                    "indexAnalyzer": "ik",
                                    "searchAnalyzer": "ik",
                                    "term_vector": "with_positions_offsets"},
                       u'title': {'boost': 1.0,
                                  'index': 'analyzed',
                                  'store': 'yes',
                                  'type': u'string',
                                  "indexAnalyzer": "ik",
                                  "searchAnalyzer": "ik",
                                  "term_vector": "with_positions_offsets"},
                       u'url': {'boost': 1.0,
                                'index': 'analyzed',
                                'store': 'yes',
                                'type': u'string',
                                #"indexAnalyzer": "ik",
                                #"searchAnalyzer": "ik",
                                "term_vector": "with_positions_offsets"}}
            self.conn.put_mapping("searchEngine-type", {'properties': mapping}, [INDEX_NAME])  # Define the type

        def AddIndex(self, item):
            print 'Adding index item, title %s' % item['title'].encode('utf-8')
            self.conn.index({'title': item['title'].encode('utf-8'),
                             'url': item['url'].encode('utf-8'),
                             'content': item['content'].encode('utf-8')},
                            INDEX_NAME, 'searchEngine-type')

            self.conn.default_indices = [INDEX_NAME]  # Set the default indices
            self.conn.refresh()  # Refresh the ES index
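For the pipeline (and with it the indexer) to actually run during a crawl, it has to be registered in the Scrapy project settings. A minimal sketch, with module paths assumed to match the layout used above:

    # settings.py (sketch -- module paths are assumptions, adjust to the real project)
    BOT_NAME = 'mymodules'
    SPIDER_MODULES = ['mymodules.spiders']

    # In this Scrapy version ITEM_PIPELINES is a plain list of pipeline classes
    ITEM_PIPELINES = ['mymodules.pipelines.DuplicatesPipeline']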
Chinese word segmentation is handled by the IK analyzer, installed as an ElasticSearch plugin (elasticsearch-analysis-ik); the mapping above refers to it by the analyzer name "ik".
The Django view function that handles search requests looks like this:
    from django.http import HttpResponse
    from django.shortcuts import render

    def search(request):
        """Handle a search request coming from the search form."""
        if 'q' in request.GET:
            q = request.GET['q']
            page = unicode(request.GET['page']) if 'page' in request.GET else u'1'
            results = dosearch(q, page)  # connect to ES and return the results
            return render(request, 'res_search.html',
                          {'results': results,
                           # ... other context entries omitted in the original post ...
                           'nextpage': int(page) + 1})
        else:
            message = 'You submitted an empty form.'
            return HttpResponse(message)
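The view also needs an entry in the URL configuration; a minimal Django 1.5-style sketch (the app label searchweb is hypothetical):

    # urls.py (sketch -- the app label 'searchweb' is hypothetical)
    from django.conf.urls import patterns, url

    urlpatterns = patterns('',
        url(r'^search/$', 'searchweb.views.search', name='search'),
    )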
The view calls the dosearch function, which connects to ES and runs the query. It looks like this:
    def dosearch(string, upage):
        conn = ES('127.0.0.1:9200', timeout=3.5)  # connect to ES
        # FieldQuery, BoolQuery, HighLighter and Search come from the pyes library;
        # PAGE_SIZE is a module-level constant (e.g. 10 results per page)
        fq_title = FieldQuery(analyzer='ik')
        fq_title.add('title', string)

        fq_content = FieldQuery(analyzer='ik')
        fq_content.add('content', string)

        # A match in either the title or the content is enough
        bq = BoolQuery(should=[fq_title, fq_content])

        # Wrap highlighted fragments in [ ] brackets
        h = HighLighter(['['], [']'], fragment_size=100)

        page = int(upage.encode('utf-8'))

        s = Search(bq, highlight=h, start=(page - 1) * PAGE_SIZE, size=PAGE_SIZE)
        s.add_highlight("content")
        s.add_highlight('title')
        results = conn.search(s, indices='xidian_spider', doc_types='searchEngine-type')

        res_list = []
        for r in results:
            # Prefer the highlighted fragment when one is available
            if r._meta.highlight.has_key("title"):
                r['title'] = r._meta.highlight[u"title"][0]
            if r._meta.highlight.has_key('content'):
                r['content'] = r._meta.highlight[u'content'][0]
            res = Result()  # simple result holder; its definition is not shown in the post
            res.content = r['content']
            res.title = r['title']
            res.url = r['url']
            res_list.append(res)

        return res_list
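As a quick sanity check of the whole chain, dosearch can be called directly from a Django shell; the module path and the query string below are only examples:

    # Quick manual test from "python manage.py shell"
    # (module path and query string are just examples)
    from searchweb.views import dosearch

    for res in dosearch(u'图书馆', u'1'):  # query "library", first results page
        print res.title
        print res.url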
|
|