- Host OS: Ubuntu 13.04
- Python version: 2.7.4
- Django version: 1.5.4
- Scrapy version: 0.18.2
- ElasticSearch version: 0.90.5
If you repost this, please credit http://blog.yanming8.cn/archives/138. With some free time on my hands, I read up on the basics of search engines. After collecting material, I learned which basic subsystems a search engine needs: a crawling subsystem, an indexing service subsystem, and a web request/response subsystem. After working through the documentation of the relevant open-source frameworks, I put together an integrated project, which has been pushed to GitHub.
Let's start with the crawler, written on top of the open-source Scrapy framework. It crawls the campus network (mainly because that traffic is free of data charges):
    import re
    import urllib

    #from urlparse import urljoin
    from scrapy.utils.url import urljoin_rfc
    from scrapy.spider import BaseSpider
    from scrapy.selector import HtmlXPathSelector
    from scrapy.http import Request
    from scrapy.exceptions import DropItem

    from mymodules.items import Website


    class Xidian_Spider(BaseSpider):
        name = "xidian_spider"
        start_urls = [
            "http://www.xidian.edu.cn",
            #"http://rs.xidian.edu.cn/forum.php",
        ]

        def __init__(self):
            """Init the allowed_domains."""
            self.allowed_domains = ['xidian.edu.cn']

        def parse(self, response):
            """This parse() uses a double yield to return either an item or a Request."""
            hxs = HtmlXPathSelector(response)

            # Collect every href on the page as a candidate link to follow
            refer_websites = hxs.select('//@href').extract()

            #if not self.gethostname(response.url) in self.allowed_domains:
            #    self.allowed_domains.append(self.gethostname(response.url))

            item = Website()
            item['url'] = response.url
            item['title'] = hxs.select('/html/head/title/text()').extract()[0]

            # FIXME: this XPath selects all text nodes, including javascript code. BAD!!
            text_list = hxs.select('/html/body//*/text()').extract()
            item['content'] = ' '.join(text_list)
            yield item

            for weburl in refer_websites:
                utf8_url = weburl.encode('utf-8')

                # The following regexes match unwanted url postfixes (static/binary files)
                # and prefixes (javascript: and openapi links), which are skipped
                postfix = re.compile(r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
                prefix = re.compile(r'^((javascript:)|(openapi)).+')

                if postfix.match(utf8_url):
                    continue
                if prefix.match(utf8_url):
                    continue
                if not utf8_url.startswith('http://'):
                    #weburl = urljoin_rfc(response.url, weburl, response.encoding)
                    weburl = 'http://' + self.gethostname(response.url) + '/' + weburl
                weburl = re.sub(r'/\.\./\.\./', r'/', weburl)
                weburl = re.sub(r'/\.\./', r'/', weburl)

                yield Request(weburl, callback=self.parse)

        def gethostname(self, res_url):
            """Get the host name of a url."""
            proto, rest = urllib.splittype(res_url)
            host, rest = urllib.splithost(rest)
            return host
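The spider fills in a Website item imported from mymodules.items. That file isn't shown in this post, but judging from the fields assigned above, a minimal sketch of it would be:

    # mymodules/items.py (sketch -- reconstructed from the fields the spider assigns)
    from scrapy.item import Item, Field


    class Website(Item):
        url = Field()
        title = Field()
        content = Field()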
The crawled items are handed to the pipeline for processing.
The pipeline performs de-duplication. The set of visited URLs can't simply be kept in an in-memory set, so a Bloom filter is used; here I just installed pybloomfilter from the open-source Python packages (worth a closer look when I have time).
    from pybloomfilter import BloomFilter
    from scrapy.exceptions import DropItem
    # SearchIndex (defined below) builds the ElasticSearch index;
    # the import path is assumed -- adjust it to the actual project layout
    from mymodules.searchindex import SearchIndex


    class DuplicatesPipeline(object):
        def __init__(self):
            # 10M capacity, 1% error rate, persisted to the file 'filter.bloom'
            self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
            self.f_write = open('visitedsites', 'w')
            self.si = SearchIndex()

        def process_item(self, item, spider):
            print '************%d pages visited!*****************' % len(self.bf)
            if self.bf.add(item['url']):  # add() returns True if the url was already in the filter
                raise DropItem("Duplicate item found: %s" % item)
            else:
                #print '%d pages visited!' % len(self.url_seen)
                self.save_to_file(item['url'], item['title'])
                self.si.AddIndex(item)
                return item

        def save_to_file(self, url, utitle):
            self.f_write.write(url)
            self.f_write.write('\t')
            self.f_write.write(utitle.encode('utf-8'))
            self.f_write.write('\n')

        def __del__(self):
            """Close the visited-sites log on shutdown."""
            self.f_write.close()
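As a quick illustration of why bf.add() doubles as the duplicate check, here is a tiny standalone sketch of the pybloomfilter behaviour the pipeline relies on (the file name is only an example):

    from pybloomfilter import BloomFilter

    bf = BloomFilter(1000, 0.01, 'demo.bloom')   # capacity, error rate, backing file
    print bf.add('http://www.xidian.edu.cn')     # False: first time this url is seen
    print bf.add('http://www.xidian.edu.cn')     # True: already present, so the item would be dropped
    print len(bf)                                # approximate number of urls stored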
The SearchIndex used in that pipeline is the class that builds the ElasticSearch index. It is defined as follows:
    from pyes import ES

    from mymodules.items import Website

    INDEX_NAME = 'xidian_spider'


    class SearchIndex(object):
        def __init__(self):
            self.conn = ES('127.0.0.1:9200', timeout=3.5)  # Connect to ES
            try:
                self.conn.delete_index(INDEX_NAME)  # Drop any index left over from a previous run
            except Exception:
                pass
            self.conn.create_index(INDEX_NAME)  # Create a new INDEX

            # Define the structure of the data format
            mapping = {u'content': {'boost': 1.0,
                                    'index': 'analyzed',
                                    'store': 'yes',
                                    'type': u'string',
                                    "indexAnalyzer": "ik",
                                    "searchAnalyzer": "ik",
                                    "term_vector": "with_positions_offsets"},
                       u'title': {'boost': 1.0,
                                  'index': 'analyzed',
                                  'store': 'yes',
                                  'type': u'string',
                                  "indexAnalyzer": "ik",
                                  "searchAnalyzer": "ik",
                                  "term_vector": "with_positions_offsets"},
                       u'url': {'boost': 1.0,
                                'index': 'analyzed',
                                'store': 'yes',
                                'type': u'string',
                                #"indexAnalyzer": "ik",
                                #"searchAnalyzer": "ik",
                                "term_vector": "with_positions_offsets"}}
            self.conn.put_mapping("searchEngine-type", {'properties': mapping}, [INDEX_NAME])  # Define the type

        def AddIndex(self, item):
            print 'Adding index item, title %s' % item['title'].encode('utf-8')
            self.conn.index({'title': item['title'].encode('utf-8'),
                             'url': item['url'].encode('utf-8'),
                             'content': item['content'].encode('utf-8')},
                            INDEX_NAME, 'searchEngine-type')

            self.conn.default_indices = [INDEX_NAME]  # Set the default indices
            self.conn.refresh()  # Refresh the ES index
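For the pipeline (and with it the indexer) to actually run during a crawl, it has to be registered in the Scrapy project settings. A minimal sketch, with module paths assumed to match the layout used above:

    # settings.py (sketch -- module paths are assumptions, adjust to the real project)
    BOT_NAME = 'mymodules'
    SPIDER_MODULES = ['mymodules.spiders']

    # In this Scrapy version ITEM_PIPELINES is a plain list of pipeline classes
    ITEM_PIPELINES = ['mymodules.pipelines.DuplicatesPipeline']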
Chinese word segmentation is handled by the IK analyzer, installed as an ElasticSearch plugin (elasticsearch-analysis-ik); the mapping above refers to it by the analyzer name "ik".
The Django view function that handles search requests looks like this:
    from django.http import HttpResponse
    from django.shortcuts import render

    def search(request):
        """Handle a search request coming from the search form."""
        if 'q' in request.GET:
            q = request.GET['q']
            page = unicode(request.GET['page']) if 'page' in request.GET else u'1'
            results = dosearch(q, page)  # connect to ES and return the results
            return render(request, 'res_search.html',
                          {'results': results,
                           # ... other context entries omitted in the original post ...
                           'nextpage': int(page) + 1})
        else:
            message = 'You submitted an empty form.'
            return HttpResponse(message)
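The view also needs an entry in the URL configuration; a minimal Django 1.5-style sketch (the app label searchweb is hypothetical):

    # urls.py (sketch -- the app label 'searchweb' is hypothetical)
    from django.conf.urls import patterns, url

    urlpatterns = patterns('',
        url(r'^search/$', 'searchweb.views.search', name='search'),
    )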
The view calls the dosearch function, which connects to ES and runs the query. It looks like this:
    def dosearch(string, upage):
        conn = ES('127.0.0.1:9200', timeout=3.5)  # connect to ES
        # FieldQuery, BoolQuery, HighLighter and Search come from the pyes library;
        # PAGE_SIZE is a module-level constant (e.g. 10 results per page)
        fq_title = FieldQuery(analyzer='ik')
        fq_title.add('title', string)

        fq_content = FieldQuery(analyzer='ik')
        fq_content.add('content', string)

        # A match in either the title or the content is enough
        bq = BoolQuery(should=[fq_title, fq_content])

        # Wrap highlighted fragments in [ ] brackets
        h = HighLighter(['['], [']'], fragment_size=100)

        page = int(upage.encode('utf-8'))

        s = Search(bq, highlight=h, start=(page - 1) * PAGE_SIZE, size=PAGE_SIZE)
        s.add_highlight("content")
        s.add_highlight('title')
        results = conn.search(s, indices='xidian_spider', doc_types='searchEngine-type')

        res_list = []
        for r in results:
            # Prefer the highlighted fragment when one is available
            if r._meta.highlight.has_key("title"):
                r['title'] = r._meta.highlight[u"title"][0]
            if r._meta.highlight.has_key('content'):
                r['content'] = r._meta.highlight[u'content'][0]
            res = Result()  # simple result holder; its definition is not shown in the post
            res.content = r['content']
            res.title = r['title']
            res.url = r['url']
            res_list.append(res)

        return res_list
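As a quick sanity check of the whole chain, dosearch can be called directly from a Django shell; the module path and the query string below are only examples:

    # Quick manual test from "python manage.py shell"
    # (module path and query string are just examples)
    from searchweb.views import dosearch

    for res in dosearch(u'图书馆', u'1'):  # query "library", first results page
        print res.title
        print res.url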
|
|