[Experience Sharing] A PDF web crawler built with Apache HttpClient
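The idea is simple: Crawler fetches the target page with Apache HttpClient, parses it with HTMLParser, collects every HTTP link whose target ends in .pdf/.PDF, and hands each one to a Downloader thread; all downloaders share a single thread-safe HttpClient, so the files are fetched in parallel. The full source is below.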

  import java.io.ByteArrayOutputStream;
  import java.io.File;
  import java.io.FileOutputStream;
  import java.io.IOException;
  import java.io.InputStream;
  import java.net.URLEncoder;
  import java.util.ArrayList;
  import java.util.List;
  import org.apache.http.HttpEntity;
  import org.apache.http.HttpResponse;
  import org.apache.http.client.ClientProtocolException;
  import org.apache.http.client.HttpClient;
  import org.apache.http.client.methods.HttpGet;
  import org.apache.http.conn.params.ConnManagerParams;
  import org.apache.http.conn.params.ConnPerRouteBean;
  import org.apache.http.conn.scheme.PlainSocketFactory;
  import org.apache.http.conn.scheme.Scheme;
  import org.apache.http.conn.scheme.SchemeRegistry;
  import org.apache.http.conn.ssl.SSLSocketFactory;
  import org.apache.http.impl.client.DefaultHttpClient;
  import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
  import org.apache.http.params.BasicHttpParams;
  import org.apache.http.params.HttpConnectionParams;
  import org.apache.http.params.HttpParams;
  import org.apache.http.protocol.BasicHttpContext;
  import org.apache.http.protocol.HttpContext;
  import org.htmlparser.Node;
  import org.htmlparser.NodeFilter;
  import org.htmlparser.Parser;
  import org.htmlparser.filters.AndFilter;
  import org.htmlparser.filters.NodeClassFilter;
  import org.htmlparser.tags.LinkTag;
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;
  public class Crawler implements Runnable{
  public static String SAVE="C:/Users/Administrator/Downloads";//local directory where downloads are saved
  private String url="";//address of the page to crawl
  public Crawler(String url){
  this.url=url;
  }
  public Crawler(){}
  /**
  * 
  * @param url address of the page to crawl
  * @return the content of that page
  * @throws ClientProtocolException
  * @throws IOException
  */
  private String crawl(String url) throws ClientProtocolException, IOException{
  System.out.println("[INFO] Crawl From : "+url);
  HttpClient httpClient = new DefaultHttpClient();
  HttpGet httpGet=new HttpGet(url);
  HttpResponse httpResponse = httpClient.execute(httpGet);
  HttpEntity httpEntity=httpResponse.getEntity();
  InputStream inStream=httpEntity.getContent();
  //collect the raw bytes first and decode once, so multi-byte characters are not split across reads
  ByteArrayOutputStream buffer=new ByteArrayOutputStream();
  byte[] bytes=new byte[1024*1000];
  int k;
  while((k=inStream.read(bytes))>=0){
  buffer.write(bytes,0,k);
  }
  inStream.close();
  httpClient.getConnectionManager().shutdown();//release the connection used for the page fetch
  String content=buffer.toString();
  System.out.println(content);
  System.out.println("=========================================================================================");
  return content;
  }
  public void run(){
  try {
  String prefix=this.url.substring(0,this.url.lastIndexOf("/"));
  String content=this.crawl(this.url);//fetch the page content
  Parser parser=new Parser(content); //parse the page content with HTMLParser
  NodeFilter filter;
  NodeList list;
  filter=new NodeClassFilter(LinkTag.class);
  filter=new AndFilter(filter,new NodeFilter(){
  public boolean accept(Node node) {
  return ((LinkTag)node).isHTTPLink();
  }});
  list=parser.extractAllNodesThatMatch(filter);
  List<String> urlsList =new ArrayList<String>();
  for(int i=0;i<list.size();i++){
  String[] array=list.elementAt(i).getText().split("\"");
  if(array.length>1&&(array[1].endsWith(".pdf")||array[1].endsWith(".PDF"))){//only keep links to PDF files
  String downloadUrl=prefix+"/"+array[1];
  urlsList.add(downloadUrl);//build the absolute address to download
  }
  }
  //from here on the downloads are performed, using multiple threads to execute the requests
  HttpParams params=new BasicHttpParams();
  //ConnManagerParams.setTimeout(params, 60000*3); //maximum time to wait for a connection from the manager
  ConnManagerParams.setMaxConnectionsPerRoute(params, new ConnPerRouteBean(50));//maximum concurrent connections per route
  //HttpConnectionParams.setConnectionTimeout(params, 60000*2);  //connection timeout
  HttpConnectionParams.setSoTimeout(params, 60000*10);//socket read timeout
  SchemeRegistry schemeRegistry=new SchemeRegistry();
  schemeRegistry.register(new Scheme("http",PlainSocketFactory.getSocketFactory(),80));
  schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443)); 
  ThreadSafeClientConnManager cm=new ThreadSafeClientConnManager(params,schemeRegistry);
  HttpClient httpClient=new DefaultHttpClient(cm,params);
  Thread[] threads=new Thread[urlsList.size()];
  int n=0;
  for(String url:urlsList){
  //the local save path is built from the file name at the end of the URL
  String path=Crawler.SAVE+url.substring(url.lastIndexOf("/"), url.length());
  //URL-encode the file name portion so non-ASCII file names can be requested
  url=url.substring(0, url.lastIndexOf("/"))+"/"+URLEncoder.encode(url.substring(url.lastIndexOf("/")+1,url.length()),"UTF-8");
  HttpGet httpGet=new HttpGet(url);
  threads[n]=new Thread(new Downloader(httpClient,httpGet,url,path));
  n++;
  }
  //start every download thread, then wait for all of them to finish
  for(Thread thread:threads)thread.start();
  for(Thread thread:threads)if(thread.isAlive())thread.join();
  }catch (InterruptedException e) {
  System.out.println("[ERROR] Download InterruptedException : "+e.toString());
  //e.printStackTrace();
  } catch (ParserException e) {
  System.out.println("[ERROR] Parse ParserException : "+e.toString());
  //e.printStackTrace();
  }catch (ClientProtocolException e) {
  System.out.println("[ERROR] Crawl ClientProtocolException : "+e.toString());
  //e.printStackTrace();
  } catch (IOException e) {
  System.out.println("[ERROR] Crawl IOException : "+e.toString());
  //e.printStackTrace();
  }
  }
  public static void main(String[] args) {
  //entry point
  Crawler crawler=new Crawler("http://www3.tjcu.edu.cn/wangshangketang/yuanneike/guanlixue/sjxz.htm");//set the page address to crawl here
  Thread thread=new Thread(crawler);
  thread.start();
  }
  }
  //the Downloader class does the actual work of writing network data to a file
  class Downloader implements Runnable{
  private String url="";
  private String path="";
  private final HttpClient httpClient;
  private final HttpContext httpContext;
  private final HttpGet httpGet;
  /**
  * 
  * @param httpClient the HttpClient shared by all threads
  * @param httpGet the HttpGet request to download
  * @param url network address of the resource
  * @param path local path where the resource is saved after download
  */
  public Downloader(HttpClient httpClient,HttpGet httpGet,String url,String path){
  this.httpClient=httpClient;
  this.httpGet=httpGet;
  this.httpContext=new BasicHttpContext();
  this.path=path;
  this.url=url;
  }
  public void run() {
  System.out.println("[INFO] Download From : "+this.url);
  File file=new File(this.path);
  if(file.exists())file.delete();//remove any stale copy before downloading
  try {
  //create the local file that the downloaded data will be written to
  file.createNewFile();
  FileOutputStream outStream = new FileOutputStream(this.path);
  //execute the request and obtain the response
  HttpResponse httpResponse = this.httpClient.execute(this.httpGet,this.httpContext);
  System.out.println("[STATUS] Download : "+httpResponse.getStatusLine()+" [FROM] "+this.path);
  HttpEntity httpEntity=httpResponse.getEntity();
  InputStream inStream=httpEntity.getContent();
  while(true){//read from the network and write to the local file until the stream is exhausted
  byte[] bytes=new byte[1024*1000];
  int k=inStream.read(bytes);
  if(k>=0){
  outStream.write(bytes,0,k);
  outStream.flush();
  }
  else break;
  }
  inStream.close();
  outStream.close();
  } catch (IOException e){
  this.httpGet.abort();
  System.out.println("[ERROR] Download IOException : "+e.toString()+" [FROM] : "+this.path);
  //e.printStackTrace();
  }
  }
  }
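
To point the crawler at another page, change the save directory and the constructor argument, then wait for the crawler thread to finish. A minimal sketch of such a caller (the directory and URL below are placeholders, not values from the post above):
  public class CrawlerDemo {
  public static void main(String[] args) throws InterruptedException {
  Crawler.SAVE="D:/pdf-downloads";//placeholder save directory
  Thread t=new Thread(new Crawler("http://example.com/materials.htm"));//placeholder page address
  t.start();
  t.join();//the crawler thread joins every Downloader thread, so this waits for all downloads
  }
  }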
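A side note: DefaultHttpClient and ThreadSafeClientConnManager are deprecated from HttpClient 4.3 onwards. If you are on a newer 4.x release, a roughly equivalent shared client for the download step can be built as sketched below; this is only a sketch under that assumption, keeping the same per-route limit and read timeout as above, and the class name is just for illustration.
  import org.apache.http.client.config.RequestConfig;
  import org.apache.http.impl.client.CloseableHttpClient;
  import org.apache.http.impl.client.HttpClients;
  import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
  public class SharedClientFactory {
  public static CloseableHttpClient build(){
  //pooled connection manager shared by all download threads
  PoolingHttpClientConnectionManager cm=new PoolingHttpClientConnectionManager();
  cm.setDefaultMaxPerRoute(50);//same concurrency as ConnPerRouteBean(50) above
  cm.setMaxTotal(50);
  RequestConfig config=RequestConfig.custom()
  .setSocketTimeout(60000*10)//same read timeout as HttpConnectionParams.setSoTimeout above
  .build();
  return HttpClients.custom()
  .setConnectionManager(cm)
  .setDefaultRequestConfig(config)
  .build();
  }
  }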
