[Experience Sharing] A PDF web crawler built with Apache HttpClient
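The idea is simple: Crawler fetches the target page with Apache HttpClient, parses it with HTMLParser, collects every HTTP link whose target ends in .pdf/.PDF, and hands each one to a Downloader thread; all downloaders share a single thread-safe HttpClient, so the files are fetched in parallel. The full source is below.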

  import java.io.ByteArrayOutputStream;
  import java.io.File;
  import java.io.FileOutputStream;
  import java.io.IOException;
  import java.io.InputStream;
  import java.net.URLEncoder;
  import java.util.ArrayList;
  import java.util.List;
  import org.apache.http.HttpEntity;
  import org.apache.http.HttpResponse;
  import org.apache.http.client.ClientProtocolException;
  import org.apache.http.client.HttpClient;
  import org.apache.http.client.methods.HttpGet;
  import org.apache.http.conn.params.ConnManagerParams;
  import org.apache.http.conn.params.ConnPerRouteBean;
  import org.apache.http.conn.scheme.PlainSocketFactory;
  import org.apache.http.conn.scheme.Scheme;
  import org.apache.http.conn.scheme.SchemeRegistry;
  import org.apache.http.conn.ssl.SSLSocketFactory;
  import org.apache.http.impl.client.DefaultHttpClient;
  import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
  import org.apache.http.params.BasicHttpParams;
  import org.apache.http.params.HttpConnectionParams;
  import org.apache.http.params.HttpParams;
  import org.apache.http.protocol.BasicHttpContext;
  import org.apache.http.protocol.HttpContext;
  import org.htmlparser.Node;
  import org.htmlparser.NodeFilter;
  import org.htmlparser.Parser;
  import org.htmlparser.filters.AndFilter;
  import org.htmlparser.filters.NodeClassFilter;
  import org.htmlparser.tags.LinkTag;
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;
  public class Crawler implements Runnable{
  public static String SAVE="C:/Users/Administrator/Downloads";//local directory where downloads are saved
  private String url="";//address of the page to crawl
  public Crawler(String url){
  this.url=url;
  }
  public Crawler(){}
  /**
  * 
  * @param url address of the page to crawl
  * @return the content of that page
  * @throws ClientProtocolException
  * @throws IOException
  */
  private String crawl(String url) throws ClientProtocolException, IOException{
  System.out.println("[INFO] Crawl From : "+url);
  HttpClient httpClient = new DefaultHttpClient();
  HttpGet httpGet=new HttpGet(url);
  HttpResponse httpResponse = httpClient.execute(httpGet);
  HttpEntity httpEntity=httpResponse.getEntity();
  InputStream inStream=httpEntity.getContent();
  //collect the raw bytes first and decode once, so multi-byte characters are not split across reads
  ByteArrayOutputStream buffer=new ByteArrayOutputStream();
  byte[] bytes=new byte[1024*1000];
  int k;
  while((k=inStream.read(bytes))>=0){
  buffer.write(bytes,0,k);
  }
  inStream.close();
  httpClient.getConnectionManager().shutdown();//release the connection used for the page fetch
  String content=buffer.toString();
  System.out.println(content);
  System.out.println("=========================================================================================");
  return content;
  }
  public void run(){
  try {
  String prefix=this.url.substring(0,this.url.lastIndexOf("/"));
  String content=this.crawl(this.url);//fetch the page content
  Parser parser=new Parser(content); //parse the page content with HTMLParser
  NodeFilter filter;
  NodeList list;
  filter=new NodeClassFilter(LinkTag.class);
  filter=new AndFilter(filter,new NodeFilter(){
  public boolean accept(Node node) {
  return ((LinkTag)node).isHTTPLink();
  }});
  list=parser.extractAllNodesThatMatch(filter);
  List<String> urlsList =new ArrayList<String>();
  for(int i=0;i<list.size();i++){
  String[] array=list.elementAt(i).getText().split("\"");
  if(array.length>1&&(array[1].endsWith(".pdf")||array[1].endsWith(".PDF"))){//only keep links to PDF files
  String downloadUrl=prefix+"/"+array[1];
  urlsList.add(downloadUrl);//build the absolute address to download
  }
  }
  //from here on the downloads are performed, using multiple threads to execute the requests
  HttpParams params=new BasicHttpParams();
  //ConnManagerParams.setTimeout(params, 60000*3); //maximum time to wait for a connection from the manager
  ConnManagerParams.setMaxConnectionsPerRoute(params, new ConnPerRouteBean(50));//maximum concurrent connections per route
  //HttpConnectionParams.setConnectionTimeout(params, 60000*2);  //connection timeout
  HttpConnectionParams.setSoTimeout(params, 60000*10);//socket read timeout
  SchemeRegistry schemeRegistry=new SchemeRegistry();
  schemeRegistry.register(new Scheme("http",PlainSocketFactory.getSocketFactory(),80));
  schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443)); 
  ThreadSafeClientConnManager cm=new ThreadSafeClientConnManager(params,schemeRegistry);
  HttpClient httpClient=new DefaultHttpClient(cm,params);
  Thread[] threads=new Thread[urlsList.size()];
  int n=0;
  for(String url:urlsList){
  //the local save path is built from the file name at the end of the URL
  String path=Crawler.SAVE+url.substring(url.lastIndexOf("/"), url.length());
  //URL-encode the file name portion so non-ASCII file names can be requested
  url=url.substring(0, url.lastIndexOf("/"))+"/"+URLEncoder.encode(url.substring(url.lastIndexOf("/")+1,url.length()),"UTF-8");
  HttpGet httpGet=new HttpGet(url);
  threads[n]=new Thread(new Downloader(httpClient,httpGet,url,path));
  n++;
  }
  //start every download thread, then wait for all of them to finish
  for(Thread thread:threads)thread.start();
  for(Thread thread:threads)if(thread.isAlive())thread.join();
  }catch (InterruptedException e) {
  System.out.println("[ERROR] Download InterruptedException : "+e.toString());
  //e.printStackTrace();
  } catch (ParserException e) {
  System.out.println("[ERROR] Parse ParserException : "+e.toString());
  //e.printStackTrace();
  }catch (ClientProtocolException e) {
  System.out.println("[ERROR] Crawl ClientProtocolException : "+e.toString());
  //e.printStackTrace();
  } catch (IOException e) {
  System.out.println("[ERROR] Crawl IOException : "+e.toString());
  //e.printStackTrace();
  }
  }
  public static void main(String[] args) {
  //entry point
  Crawler crawler=new Crawler("http://www3.tjcu.edu.cn/wangshangketang/yuanneike/guanlixue/sjxz.htm");//set the page address to crawl here
  Thread thread=new Thread(crawler);
  thread.start();
  }
  }
  //the Downloader class does the actual work of writing network data to a file
  class Downloader implements Runnable{
  private String url="";
  private String path="";
  private final HttpClient httpClient;
  private final HttpContext httpContext;
  private final HttpGet httpGet;
  /**
  * 
  * @param httpClient the HttpClient shared by all threads
  * @param httpGet the HttpGet request to download
  * @param url network address of the resource
  * @param path local path where the resource is saved after download
  */
  public Downloader(HttpClient httpClient,HttpGet httpGet,String url,String path){
  this.httpClient=httpClient;
  this.httpGet=httpGet;
  this.httpContext=new BasicHttpContext();
  this.path=path;
  this.url=url;
  }
  public void run() {
  System.out.println("[INFO] Download From : "+this.url);
  File file=new File(this.path);
  if(file.exists())file.delete();//remove any stale copy before downloading
  try {
  //create the local file that the downloaded data will be written to
  file.createNewFile();
  FileOutputStream outStream = new FileOutputStream(this.path);
  //execute the request and obtain the response
  HttpResponse httpResponse = this.httpClient.execute(this.httpGet,this.httpContext);
  System.out.println("[STATUS] Download : "+httpResponse.getStatusLine()+" [FROM] "+this.path);
  HttpEntity httpEntity=httpResponse.getEntity();
  InputStream inStream=httpEntity.getContent();
  while(true){//read from the network and write to the local file until the stream is exhausted
  byte[] bytes=new byte[1024*1000];
  int k=inStream.read(bytes);
  if(k>=0){
  outStream.write(bytes,0,k);
  outStream.flush();
  }
  else break;
  }
  inStream.close();
  outStream.close();
  } catch (IOException e){
  this.httpGet.abort();
  System.out.println("[ERROR] Download IOException : "+e.toString()+" [FROM] : "+this.path);
  //e.printStackTrace();
  }
  }
  }
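
To point the crawler at another page, change the save directory and the constructor argument, then wait for the crawler thread to finish. A minimal sketch of such a caller (the directory and URL below are placeholders, not values from the post above):
  public class CrawlerDemo {
  public static void main(String[] args) throws InterruptedException {
  Crawler.SAVE="D:/pdf-downloads";//placeholder save directory
  Thread t=new Thread(new Crawler("http://example.com/materials.htm"));//placeholder page address
  t.start();
  t.join();//the crawler thread joins every Downloader thread, so this waits for all downloads
  }
  }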
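A side note: DefaultHttpClient and ThreadSafeClientConnManager are deprecated from HttpClient 4.3 onwards. If you are on a newer 4.x release, a roughly equivalent shared client for the download step can be built as sketched below; this is only a sketch under that assumption, keeping the same per-route limit and read timeout as above, and the class name is just for illustration.
  import org.apache.http.client.config.RequestConfig;
  import org.apache.http.impl.client.CloseableHttpClient;
  import org.apache.http.impl.client.HttpClients;
  import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
  public class SharedClientFactory {
  public static CloseableHttpClient build(){
  //pooled connection manager shared by all download threads
  PoolingHttpClientConnectionManager cm=new PoolingHttpClientConnectionManager();
  cm.setDefaultMaxPerRoute(50);//same concurrency as ConnPerRouteBean(50) above
  cm.setMaxTotal(50);
  RequestConfig config=RequestConfig.custom()
  .setSocketTimeout(60000*10)//same read timeout as HttpConnectionParams.setSoTimeout above
  .build();
  return HttpClients.custom()
  .setConnectionManager(cm)
  .setDefaultRequestConfig(config)
  .build();
  }
  }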
