搜索引擎–基于Django/Scrapy/ElasticSearch的搜索引擎的实现（转）

判官007 发表于 2017-5-21 10:58:55

- 主机环境：Ubuntu 13.04
- Python版本：2.7.4
- Django版本：1.5.4
- Scrapy版本：0.18.2
- ElasticSearch版本：0.90.5
转载请标明：http://blog.yanming8.cn/archives/138　　闲来无聊，查看了相关搜索引擎的基本知识，经过搜集资料，了解了搜索引擎所需要的基本子系统，爬取子系统，索引服务子系统，Web请求和应答子系统。然后经过学习基本的开源框架文档，集成的项目已经PUSH到GitHub。
首先查看基于开源的Scrapy爬虫框架编写的一个爬虫，爬取校园网的内容（主要是免流量）
view source

01#!/usr/bin/env python
02#-*- coding:utf-8 -*-
03#from urlparse import urljoin
04from scrapy.utils.url import urljoin_rfc
05from scrapy.spider import BaseSpider
06from scrapy.selector import HtmlXPathSelector
07from scrapy.http import Request
08
09from scrapy.exceptions import DropItem
10
11from mymodules.items import Website
12
13import urllib
14import re
15
class Xidian_Spider(BaseSpider):
    """Crawl pages under xidian.edu.cn and emit one ``Website`` item per page.

    Each response becomes an item holding its URL, title and body text, and
    every usable ``href`` on the page is scheduled to be parsed the same way.
    """

    name = "xidian_spider"
    start_urls = [
        "http://www.xidian.edu.cn",
        #"http://rs.xidian.edu.cn/forum.php",
    ]

    # Compiled once instead of once per link.
    # Links ending in one of these extensions (optionally followed by "x",
    # e.g. "docx") are skipped.
    SKIP_POSTFIX = re.compile(
        r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
    # Links starting with one of these prefixes are skipped.
    SKIP_PREFIX = re.compile(r'^((javascript:)|(openapi)).+')
    # Runs of "/../" still present in a URL after it has been made absolute.
    PARENT_SEGMENTS = re.compile(r'(?:/\.\.)+/')

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then set ``allowed_domains``.

        Arguments are forwarded unchanged to ``BaseSpider.__init__``.
        """
        super(Xidian_Spider, self).__init__(*args, **kwargs)
        self.allowed_domains = ['xidian.edu.cn']

    def parse(self, response):
        """Yield the item for ``response``, then one ``Request`` per link.

        This is a generator: the first value yielded is the ``Website`` item
        built from the page; the following values are ``Request`` objects
        (with this method as callback) for the links found on the page.
        """
        hxs = HtmlXPathSelector(response)

        item = Website()
        item['url'] = response.url

        # A page without a <title> must not abort the whole parse.
        titles = hxs.select('/html/head/title/text()').extract()
        item['title'] = titles[0] if titles else u''

        # Leave out <script>/<style> text so code is not stored as content.
        texts = hxs.select(
            '/html/body//*[not(self::script or self::style)]/text()').extract()
        item['content'] = u''.join(text.strip() + u' ' for text in texts)

        yield item

        for href in hxs.select('//@href').extract():
            utf8_href = href.encode('utf-8')

            if self.SKIP_POSTFIX.match(utf8_href):
                continue
            if self.SKIP_PREFIX.match(utf8_href):
                continue

            # Resolve against the page URL so relative paths keep their base
            # directory and already-absolute links are left alone.
            target = urljoin_rfc(response.url, utf8_href)

            # Only http(s) targets are requested (drops mailto:, ftp:, ...).
            if not target.startswith(('http://', 'https://')):
                continue

            target = self.PARENT_SEGMENTS.sub('/', target)
            yield Request(target, callback=self.parse)

    def gethostname(self, res_url):
        """Return the host part of ``res_url`` as split off by ``urllib``."""
        _, rest = urllib.splittype(res_url)
        host, _ = urllib.splithost(rest)
        return host

爬取得到的ITEM会交给PIPELINE处理。
这里的PipeLine做了去重处理。已访问的URL数量很大，不能简单地全部放在内存中，所以使用的是Bloom Filter算法，这里直接安装了Python开源库中的pybloomfilter（有时间研究一下）
view source

class DuplicatesPipeline(object):
    """Item pipeline that drops already-seen URLs and indexes the rest.

    Seen URLs are tracked in a Bloom filter. Each new item is logged to the
    ``visitedsites`` file and handed to ``SearchIndex`` for indexing.
    """

    # True until __init__ has finished, so cleanup on a half-built instance
    # is a no-op instead of an AttributeError.
    _closed = True

    def __init__(self):
        """Create the Bloom filter, the log file and the search index."""
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self._closed = False

    def process_item(self, item, spider):
        """Index ``item`` unless its URL was seen before.

        Returns the item unchanged when it is new.
        Raises ``DropItem`` when the URL is already in the Bloom filter.
        """
        print('************%d pages visited!*****************' % len(self.bf))
        # add() is truthy when the URL was already present in the filter.
        if self.bf.add(item['url']):
            raise DropItem("Duplicate item found: %s" % item)

        self.save_to_file(item['url'], item['title'])
        self.si.AddIndex(item)
        return item

    def save_to_file(self, url, utitle):
        """Append one ``url<TAB>title`` line to the visited-sites file.

        ``utitle`` is encoded to UTF-8 before being written.
        """
        self.f_write.write(url)
        self.f_write.write('\t%s\n' % utitle.encode('utf-8'))

    def close_spider(self, spider):
        """Scrapy hook: release resources when the spider is closed."""
        self._shutdown()

    def __del__(self):
        """Fallback cleanup in case ``close_spider`` was never called."""
        self._shutdown()

    def _shutdown(self):
        """Close the log file and finish the index, at most once."""
        if self._closed:
            return
        self._closed = True
        self.f_write.close()
        self.si.IndexDone()

该类中的SearchIndex是ElasticSearch建立索引的类。定义如下：
view source

01#!/usr/bin/env python
02#-*- coding:utf-8-*-
03import os
04import sys
05from pyes import *
06from mymodules.items import Website
07INDEX_NAME='xidian_spider'
08
class SearchIndex(object):
    """Build the ElasticSearch index that holds the crawled pages."""

    @staticmethod
    def _text_field(analyzer=None):
        """Return the mapping for one stored, analyzed string field.

        When ``analyzer`` is given it is used as both the index and the
        search analyzer; otherwise no analyzer keys are set.
        """
        field = {
            'boost': 1.0,
            'index': 'analyzed',
            'store': 'yes',
            'type': u'string',
            "term_vector": "with_positions_offsets",
        }
        if analyzer is not None:
            field["indexAnalyzer"] = analyzer
            field["searchAnalyzer"] = analyzer
        return field

    def SearchInit(self):
        """Connect to ES, recreate the index and install the type mapping."""
        self.conn = ES('127.0.0.1:9200', timeout=3.5)  # Connect to ES
        try:
            self.conn.delete_index(INDEX_NAME)
        except Exception:
            # Best effort: a failed delete is ignored and the create below
            # is attempted anyway. Narrowed from a bare except so that
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
        self.conn.create_index(INDEX_NAME)  # Create a new INDEX

        # Define the structure of the data format.
        mapping = {
            u'content': self._text_field(analyzer="ik"),
            u'title': self._text_field(analyzer="ik"),
            u'url': self._text_field(),
        }

        # Define the type on this index explicitly; the index-list argument
        # was missing from the listing.
        self.conn.put_mapping("searchEngine-type",
                              {'properties': mapping},
                              [INDEX_NAME])

    def AddIndex(self, item):
        """Index one item's title, url and content, each UTF-8 encoded."""
        print('Adding Index item URL %s' % item['title'].encode('utf-8'))
        document = {
            'title': item['title'].encode('utf-8'),
            'url': item['url'].encode('utf-8'),
            'content': item['content'].encode('utf-8'),
        }
        self.conn.index(document, INDEX_NAME, 'searchEngine-type')

    def IndexDone(self):
        """Set the default indices to this index and refresh ES."""
        self.conn.default_indices = [INDEX_NAME]  # Set the default indices
        self.conn.refresh()  # Refresh the ES

其中中文分词使用的是IK分词，Python库中直接安装即可。
Django中接受搜索请求的处理函数如下：
view source

def search(request):
    """Handle a search request and render the result page.

    Reads ``q`` (the query) and the optional ``page`` from ``request.GET``.
    A missing or blank ``q`` returns a plain "empty form" response. A missing,
    non-numeric or below-1 ``page`` is treated as page 1.
    """
    # Local import: wall-clock time is needed here, because a CPU-time clock
    # does not count the time spent waiting for ES to answer.
    import time

    q = request.GET.get('q', '')
    if not q.strip():
        message = 'You submitted an empty form.'
        return HttpResponse(message)

    try:
        page_number = int(request.GET.get('page', 1))
    except (TypeError, ValueError):
        page_number = 1
    # Clamp here so 'page' and 'nextpage' agree with what was searched.
    if page_number < 1:
        page_number = 1
    page = unicode(page_number)

    start = time.time()
    results = dosearch(q, page)  # connect to ES to return the results
    end = time.time()

    context = {
        'results': results,
        'query': q,
        'count': len(results),
        'time': end - start,
        'page': page,
        'nextpage': page_number + 1,
    }
    return render(request, 'res_search.html', context)

其中调用dosearch函数进行连接ES查询，函数内容如下：
view source

def dosearch(string, upage):
    """Query ES for ``string`` and return one page of ``Results`` objects.

    ``string`` is matched against the ``title`` and ``content`` fields.
    ``upage`` is the 1-based page number as text; anything that is not a
    number, or is below 1, is treated as page 1. Where ES returns a
    highlighted fragment for ``title`` or ``content``, that fragment replaces
    the stored value in the result.
    """
    conn = ES('127.0.0.1:9200', timeout=3.5)  # connect to ES

    fq_title = FieldQuery(analyzer='ik')
    fq_title.add('title', string)

    fq_content = FieldQuery(analyzer='ik')
    fq_content.add('content', string)

    # A hit may match either field; the listing had lost this list.
    bq = BoolQuery(should=[fq_title, fq_content])

    h = HighLighter(['['], [']'], fragment_size=100)

    try:
        page = int(upage)
    except (TypeError, ValueError):
        page = 1
    if page < 1:
        page = 1

    s = Search(bq, highlight=h, start=(page - 1) * PAGE_SIZE, size=PAGE_SIZE)
    s.add_highlight("content")
    s.add_highlight('title')
    results = conn.search(s, indices='xidian_spider',
                          doc_types='searchEngine-type')

    hits = []
    for r in results:
        # NOTE(review): assumes a hit without highlight data exposes a
        # missing or empty ``_meta.highlight`` - confirm against pyes.
        highlight = getattr(r._meta, 'highlight', None) or {}
        if "title" in highlight:
            r['title'] = highlight["title"][0]
        if 'content' in highlight:
            r['content'] = highlight['content'][0]

        res = Results()
        res.content = r['content']
        res.title = r['title']
        res.url = r['url']
        hits.append(res)
    return hits

页: [1]

运维网's Archiver

搜索引擎–基于Django/Scrapy/ElasticSearch的搜索引擎的实现（转）