用Python实现的一个简单的爬取省市乡镇的行政区划信息的脚本

tiani · 发表于 2015-12-2 01:00:14

# coding=utf-8　　# Creeper
　　import os
　　import bs4
　　import time
　　import MySQLdb
　　import urllib2
　　import datetime
　　import warnings
　　import traceback
　　import ConfigParser
　　
　　try:
　　basedir = os.path.dirname(os.path.abspath(__file__))
　　except NameError:
　　import sys
　　basedir = os.path.dirname(os.path.abspath(sys.argv[0]))
　　
　　SETTINGS_FILE = os.path.join(basedir, 'settings.ini')
　　GLOBAL_CONFIG = {
　　'server': {
　　'debug': False,
　　},
　　'db': {
　　'host': '127.0.0.1',
　　'port': '3306',
　　'user': 'root',
　　'password': '',
　　'dbname': 'test',
　　'table': 'group'
　　}
　　}
　　
　　def __config(item):
　　GLOBAL_CONFIG[sec][item[0]] = item[1]
　　try:
　　parser = ConfigParser.ConfigParser()
　　parser.readfp(open(SETTINGS_FILE))
　　for sec in parser.sections():
　　map(__config, parser.items(sec))
　　except:
　　print 'settings.ini needed'
　　raise
　　
　　N = 0
　　
　　class Handle(object):
　　
　　def __init__(self, *args, **kwargs):
　　super(Handle, self).__init__()
　　self.db = {}
　　self.conn = None
　　self.cursor = None
　　self._cursor(**kwargs)
　　
　　def _cursor(self, **kwargs):
　　self.db.update(**kwargs)
　　host = kwargs.get('host', '127.0.0.1')
　　port = int(kwargs.get('port', 3306))
　　user = kwargs.get('user', 'root')
　　pwd = kwargs.get('password', '')
　　dbname = kwargs.get('dbname', 'test')
　　charset = kwargs.get('charset', 'utf8')
　　_conn = MySQLdb.connect(user=user, passwd=pwd,
　　host=host, port=port, )
　　try:
　　_conn.select_db(dbname)
　　except:
　　sql = """CREATE DATABASE IF NOT EXISTS `%s` DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
　　""" % dbname
　　_conn.cursor().execute(sql)
　　_conn.select_db(dbname)
　　self.conn = _conn
　　self.cursor = _conn.cursor()
　　
　　def format_pk(self, pk, index):
　　d = {1:2, 2:4, 3:6, 4:9, 5:12}
　　try:
　　pk = int(float(pk))
　　except:
　　raise ValueError, 'the primary key must be integer or string interger'
　　return int(str(pk)[:d[index]])
　　while (pk * 1.0 / 10).is_integer():
　　pk = pk * 1.0 / 10
　　return pk < 10 and int(pk) * 10 or int(pk)
　　
　　def do_execute(self, pk, name, type, parent='NULL'):
　　global N
　　now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')
　　if N%100 == 0 and N != 0:
　　print 'Total: %s ---%s' % (N, now)
　　# time.sleep(5)
　　if not parent or parent == '':
　　parent = 'NULL'
　　sql = """INSERT INTO `%s` (`id`, `name`, `type`, `parent`) VALUES(%s, '%s', '%s', %s);
　　""" % (self.db['table'], pk, name, type, parent)
　　try:
　　self.cursor.execute(sql)
　　self.conn.commit()
　　print '+',
　　N += 1
　　except MySQLdb.Warning, w:
　　print "\nWarning:%s" % str(w)
　　print '#',
　　except MySQLdb.Error, e:
　　if not 'Duplicate entry' in str(e):
　　print "\nError:%s" % str(e)
　　self.debug(pk, name)
　　self.debug(pk, parent)
　　else:
　　print '=',
　　self.debug(pk, name)
　　except:
　　traceback.print_exc()
　　print '?',
　　
　　def do_executemany(self, items, params=None):
　　sql = """INSERT INTO `%s` (`id`, `name`, `type`, `parent`) VALUES """ % self.db['table']
　　sql +="(%s, %s, %s, %s)";
　　try:
　　self.cursor.executemany(sql, items)
　　self.conn.commit()
　　except MySQLdb.Error, e:
　　print "Error:%s" % str(e)
　　except:
　　traceback.print_exc()
　　
　　
　　class Creeper(Handle):
　　
　　def __init__(self, *args, **kwargs):
　　Handle.__init__(self, *args, **kwargs)
　　self.init_db()
　　self.root_url = ''
　　self._type = {1:'province',2:'city',3:'county',4:'town',5:'village'}
　　
　　def init_db(self):
　　__sql = """CREATE TABLE IF NOT EXISTS `%s` (
　　`id` bigint(20) NOT NULL,
　　`name` varchar(30) NOT NULL,
　　`type` varchar(30) NOT NULL,
　　`parent` bigint(20) DEFAULT NULL,
　　
　　PRIMARY KEY (`id`),
　　KEY `Group_12345` (`name`),
　　KEY `Group_67890` (`parent`),
　　FOREIGN KEY(parent) REFERENCES `%s` (`id`)
　　) ENGINE=InnoDB DEFAULT CHARSET=utf8;
　　""" % (self.db['table'], self.db['table'])
　　
　　try:
　　self.cursor.execute(__sql)
　　self.cursor.execute("""CREATE TABLE IF NOT EXISTS `debug` (
　　`id` int(11) NOT NULL AUTO_INCREMENT,
　　`badid` bigint(20) NOT NULL,
　　`others` varchar(256) DEFAULT NULL,
　　PRIMARY KEY (`id`)
　　) ENGINE=InnoDB DEFAULT CHARSET=utf8;""")
　　self.conn.commit()
　　except MySQLdb.Warning, w:
　　if not 'already exists' in str(w):
　　print "Warning:%s" % str(w)
　　except MySQLdb.Error, e:
　　print "Error:%s" % str(e)
　　traceback.print_exc()
　　except:
　　traceback.print_exc()
　　pass
　　
　　def debug(self, *args):
　　try:
　　_sql = """INSERT INTO `debug` (`badid`, `others`) VALUES(%s, '%s');
　　""" % (args[0], args[1])
　　self.cursor.execute(_sql)
　　self.conn.commit()
　　except:
　　traceback.print_exc()
　　finally:
　　return
　　
　　
　　def __get_url(self, tup, index):
　　id = tup[0]
　　parent = tup[-1]
　　__url = []
　　for i in range(0, index, 2):
　　__url.append(str(id)[i: i+2])
　　__url.append(str(id))
　　return self.root_url + '/'.join(__url) + '.html'
　　
　　def format_tag(self, x_tag, url, index, limit=None):
　　if limit and index > limit: return
　　parent = str(url.split('/2013/')[1].split('/')[-1])[:-5]
　　__type = self._type[index]
　　__parent = not parent and 'NULL' or parent
　　if isinstance(x_tag, bs4.element.Tag) and x_tag.has_attr('href'):
　　print '.',
　　href = x_tag['href']
　　child_url = '/'.join(url.split('/')[:-1])
　　full_url = '/'.join([child_url, href])
　　__pk = self.format_pk(href.split('.html')[0].split('/')[-1], index)
　　__name = x_tag.text
　　i = (__pk, __name, __type, __parent)
　　self.do_execute(*i)
　　self.get_info(full_url, index + 1, limit)
　　else:
　　print '*',
　　__pk = self.format_pk(x_tag[0].text, index)
　　__name = x_tag[1].text
　　self.do_execute(__pk, __name, __type, __parent)
　　
　　def get_info(self, url, index, limit=None):
　　# 解析页面,获取目标区域的数据
　　# 获取单元数据,格式化为可供插入数据库的元组
　　try:
　　__html = urllib2.urlopen(url).read()
　　__soup = bs4.BeautifulSoup(__html, from_encoding='gbk')

　　__tr = __soup('tr',>　　except:
　　try:
　　self.debug(0, url)
　　except:
　　traceback.print_exc()
　　finally:
　　return
　　__lst = []
　　for tr in __tr:
　　# 每个tr中的多个td代表多个省
　　if index == 1:
　　for td in tr('td'):
　　__lst.extend(td('a'))
　　continue
　　
　　# 每个tr中的多个td代表一个节点,取最后一个td中的a标签
　　if tr('td')[-1]('a'):
　　__lst.extend(tr('td')[-1]('a'))
　　else:
　　# 没有子节点的元素,单纯的通过td中的数据创建
　　self.format_tag([tr('td')[0], tr('td')[-1]], url, index, limit)
　　
　　for a in __lst:
　　self.format_tag(a, url, index, limit)
　　
　　
　　def do_get_childs(self, index):
　　# 1. 查询获取当前层级节点数目
　　_sql = """SELECT COUNT( * ) FROM `%s` WHERE `type`='%s';
　　""" % (self.db['table'], self._type[index])
　　self.cursor.execute(_sql)
　　total = self.cursor.fetchone()[0]
　　
　　# 2. 遍历父节点,获取子节点数据
　　for i in range(0, total, 30):
　　_sql = """SELECT `id`, `parent` FROM `%s` WHERE `type` = '%s' LIMIT %s, %s;
　　""" % (self.db['table'], self._type[index], i, i+30)
　　self.cursor.execute(_sql)
　　id_and_parent = list(set(self.cursor.fetchall()))

　　for tup in>　　url = self.__get_url(tup, index)
　　self.get_info(url, index+1)
　　
　　
　　warnings.filterwarnings('error', category = MySQLdb.Warning)
　　del warnings
　　
　　if __name__ == '__main__':
　　url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/"
　　db_config = GLOBAL_CONFIG['db']
　　g = Creeper(**db_config)
　　g.root_url = url
　　g.init_db()
　　lst = g.get_info(url, index=1, limit=5)
　　# g.do_get_childs(3)

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] 用Python实现的一个简单的爬取省市乡镇的行政区划信息的脚本

扫码加入运维网微信交流群