Python爬虫基础

ybaidukuai 发表于 2018-8-15 12:59:33

#coding:utf-8　　
#爬虫基础，需要两个模块urllib和re
　　
import urllib,re
　　

　　
#获取网页源码
　　
def get_html(url='http://www.baidu.com'):
    """Download a web page and return its raw source.

    Args:
        url: Address of the page to fetch. Defaults to the Baidu home page,
            which is the only page this function fetched before it accepted
            a parameter.

    Returns:
        The response body exactly as read from the connection. It is a byte
        string and is not decoded here.
    """
    # urlopen lives in different modules on Python 2 and Python 3.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    page = urlopen(url)
    try:
        # read() is a method of the response object, not a free function.
        return page.read()
    finally:
        # Release the connection even if reading fails.
        page.close()
　　
# Running index used to name downloaded files (0.jpg, 1.jpg, ...). It lives at
# module level so numbering continues across calls to getimages().
x = 0


def getimages(html=None, dest_dir='/tmp/python'):
    """Find image URLs in a page's HTML and download each one.

    Every match of ``src="..." class=`` is treated as an image URL. Each URL
    is printed and then saved as ``<dest_dir>/<x>.jpg``, where ``x`` is the
    module-level counter; the counter is advanced after every download so
    files do not overwrite one another.

    Args:
        html: Page source to scan. When omitted, the page returned by
            ``get_html()`` is used.
        dest_dir: Directory the images are written to. It is created if it
            does not exist and there is at least one image to save.

    Returns:
        A list with the local path of every file saved, in download order.
    """
    import os
    # urlretrieve lives in different modules on Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    global x

    if html is None:
        html = get_html()
    # On Python 3 a fetched page is bytes, which a text pattern cannot
    # search; decode it first. On Python 2 bytes is str, so this is skipped.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode('utf-8', 'replace')

    # Compile once, then collect every captured URL as a list.
    image_re = re.compile(r'src="(.*?)" class=')
    image_list = image_re.findall(html)

    # urlretrieve does not create missing directories.
    if image_list and not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)

    saved = []
    for image_url in image_list:
        print(image_url)
        target = os.path.join(dest_dir, '%s.jpg' % x)
        # Download the resource behind the URL to the local file.
        urlretrieve(image_url, target)
        saved.append(target)
        # Advance the counter so the next image gets its own file name.
        x += 1
    return saved

页: [1]

运维网's Archiver

python爬虫基础