tiani 发表于 2015-12-2 01:00:14

用Python实现的一个简单的爬取省市乡镇的行政区划信息的脚本

# coding=utf-8  # Creeper
  import os
  import bs4
  import time
  import MySQLdb
  import urllib2
  import datetime
  import warnings
  import traceback
  import ConfigParser
  
  try:
  basedir = os.path.dirname(os.path.abspath(__file__))
  except NameError:
  import sys
  basedir = os.path.dirname(os.path.abspath(sys.argv))
  
  SETTINGS_FILE = os.path.join(basedir, 'settings.ini')
  GLOBAL_CONFIG = {
  'server': {
  'debug': False,
  },
  'db': {
  'host': '127.0.0.1',
  'port': '3306',
  'user': 'root',
  'password': '',
  'dbname': 'test',
  'table': 'group'
  }
  }
  
  def __config(item):
  GLOBAL_CONFIG] = item
  try:
  parser = ConfigParser.ConfigParser()
  parser.readfp(open(SETTINGS_FILE))
  for sec in parser.sections():
  map(__config, parser.items(sec))
  except:
  print 'settings.ini needed'
  raise
  
  N = 0
  
  class Handle(object):
  
  def __init__(self, *args, **kwargs):
  super(Handle, self).__init__()
  self.db = {}
  self.conn = None
  self.cursor = None
  self._cursor(**kwargs)
  
  def _cursor(self, **kwargs):
  self.db.update(**kwargs)
  host = kwargs.get('host', '127.0.0.1')
  port = int(kwargs.get('port', 3306))
  user = kwargs.get('user', 'root')
  pwd = kwargs.get('password', '')
  dbname = kwargs.get('dbname', 'test')
  charset = kwargs.get('charset', 'utf8')
  _conn = MySQLdb.connect(user=user, passwd=pwd,
  host=host, port=port, )
  try:
  _conn.select_db(dbname)
  except:
  sql = """CREATE DATABASE IF NOT EXISTS `%s` DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
  """ % dbname
  _conn.cursor().execute(sql)
  _conn.select_db(dbname)
  self.conn = _conn
  self.cursor = _conn.cursor()
  
  def format_pk(self, pk, index):
  d = {1:2, 2:4, 3:6, 4:9, 5:12}
  try:
  pk = int(float(pk))
  except:
  raise ValueError, 'the primary key must be integer or string interger'
  return int(str(pk)[:d])
  while (pk * 1.0 / 10).is_integer():
  pk = pk * 1.0 / 10
  return pk < 10 and int(pk) * 10 or int(pk)
  
  def do_execute(self, pk, name, type, parent='NULL'):
  global N
  now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')
  if N%100 == 0 and N != 0:
  print 'Total: %s ---%s' % (N, now)
  # time.sleep(5)
  if not parent or parent == '':
  parent = 'NULL'
  sql = """INSERT INTO `%s` (`id`, `name`, `type`, `parent`) VALUES(%s, '%s', '%s', %s);
  """ % (self.db['table'], pk, name, type, parent)
  try:
  self.cursor.execute(sql)
  self.conn.commit()
  print '+',
  N += 1
  except MySQLdb.Warning, w:
  print "\nWarning:%s" % str(w)
  print '#',
  except MySQLdb.Error, e:
  if not 'Duplicate entry' in str(e):
  print "\nError:%s" % str(e)
  self.debug(pk, name)
  self.debug(pk, parent)
  else:
  print '=',
  self.debug(pk, name)
  except:
  traceback.print_exc()
  print '?',
  
  def do_executemany(self, items, params=None):
  sql = """INSERT INTO `%s` (`id`, `name`, `type`, `parent`) VALUES """ % self.db['table']
  sql +="(%s, %s, %s, %s)";
  try:
  self.cursor.executemany(sql, items)
  self.conn.commit()
  except MySQLdb.Error, e:
  print "Error:%s" % str(e)
  except:
  traceback.print_exc()
  
  
  class Creeper(Handle):
  
  def __init__(self, *args, **kwargs):
  Handle.__init__(self, *args, **kwargs)
  self.init_db()
  self.root_url = ''
  self._type = {1:'province',2:'city',3:'county',4:'town',5:'village'}
  
  def init_db(self):
  __sql = """CREATE TABLE IF NOT EXISTS `%s` (
  `id` bigint(20) NOT NULL,
  `name` varchar(30) NOT NULL,
  `type` varchar(30) NOT NULL,
  `parent` bigint(20) DEFAULT NULL,
  
  PRIMARY KEY (`id`),
  KEY `Group_12345` (`name`),
  KEY `Group_67890` (`parent`),
  FOREIGN KEY(parent) REFERENCES `%s` (`id`)
  ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
  """ % (self.db['table'], self.db['table'])
  
  try:
  self.cursor.execute(__sql)
  self.cursor.execute("""CREATE TABLE IF NOT EXISTS `debug` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `badid` bigint(20) NOT NULL,
  `others` varchar(256) DEFAULT NULL,
  PRIMARY KEY (`id`)
  ) ENGINE=InnoDB DEFAULT CHARSET=utf8;""")
  self.conn.commit()
  except MySQLdb.Warning, w:
  if not 'already exists' in str(w):
  print "Warning:%s" % str(w)
  except MySQLdb.Error, e:
  print "Error:%s" % str(e)
  traceback.print_exc()
  except:
  traceback.print_exc()
  pass
  
  def debug(self, *args):
  try:
  _sql = """INSERT INTO `debug` (`badid`, `others`) VALUES(%s, '%s');
  """ % (args, args)
  self.cursor.execute(_sql)
  self.conn.commit()
  except:
  traceback.print_exc()
  finally:
  return
  
  
  def __get_url(self, tup, index):
  id = tup
  parent = tup[-1]
  __url = []
  for i in range(0, index, 2):
  __url.append(str(id))
  __url.append(str(id))
  return self.root_url + '/'.join(__url) + '.html'
  
  def format_tag(self, x_tag, url, index, limit=None):
  if limit and index > limit: return
  parent = str(url.split('/2013/').split('/')[-1])[:-5]
  __type = self._type
  __parent = not parent and 'NULL' or parent
  if isinstance(x_tag, bs4.element.Tag) and x_tag.has_attr('href'):
  print '.',
  href = x_tag['href']
  child_url = '/'.join(url.split('/')[:-1])
  full_url = '/'.join()
  __pk = self.format_pk(href.split('.html').split('/')[-1], index)
  __name = x_tag.text
  i = (__pk, __name, __type, __parent)
  self.do_execute(*i)
  self.get_info(full_url, index + 1, limit)
  else:
  print '*',
  __pk = self.format_pk(x_tag.text, index)
  __name = x_tag.text
  self.do_execute(__pk, __name, __type, __parent)
  
  def get_info(self, url, index, limit=None):
  # 解析页面,获取目标区域的数据
  # 获取单元数据,格式化为可供插入数据库的元组
  try:
  __html = urllib2.urlopen(url).read()
  __soup = bs4.BeautifulSoup(__html, from_encoding='gbk')

  __tr = __soup('tr',>  except:
  try:
  self.debug(0, url)
  except:
  traceback.print_exc()
  finally:
  return
  __lst = []
  for tr in __tr:
  # 每个tr中的多个td代表多个省
  if index == 1:
  for td in tr('td'):
  __lst.extend(td('a'))
  continue
  
  # 每个tr中的多个td代表一个节点,取最后一个td中的a标签
  if tr('td')[-1]('a'):
  __lst.extend(tr('td')[-1]('a'))
  else:
  # 没有子节点的元素,单纯的通过td中的数据创建
  self.format_tag(, tr('td')[-1]], url, index, limit)
  
  for a in __lst:
  self.format_tag(a, url, index, limit)
  
  
  def do_get_childs(self, index):
  # 1. 查询获取当前层级节点数目
  _sql = """SELECT COUNT( * ) FROM`%s` WHERE `type`='%s';
  """ % (self.db['table'], self._type)
  self.cursor.execute(_sql)
  total = self.cursor.fetchone()
  
  # 2. 遍历父节点,获取子节点数据
  for i in range(0, total, 30):
  _sql = """SELECT `id`, `parent` FROM `%s` WHERE `type` = '%s' LIMIT %s, %s;
  """ % (self.db['table'], self._type, i, i+30)
  self.cursor.execute(_sql)
  id_and_parent = list(set(self.cursor.fetchall()))

  for tup in>  url = self.__get_url(tup, index)
  self.get_info(url, index+1)
  
  
  warnings.filterwarnings('error', category = MySQLdb.Warning)
  del warnings
  
  if __name__ == '__main__':
  url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/"
  db_config = GLOBAL_CONFIG['db']
  g = Creeper(**db_config)
  g.root_url = url
  g.init_db()
  lst = g.get_info(url, index=1, limit=5)
  # g.do_get_childs(3)
页: [1]
查看完整版本: 用Python实现的一个简单的爬取省市乡镇的行政区划信息的脚本