neocai 发表于 2017-1-13 10:25:32

使用 Apache HttpClient 工具模拟百度蜘蛛或浏览器抓取和解压gzip网页

  package httpclient;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.InputStreamReader;
  import java.util.zip.GZIPInputStream;
  import org.apache.commons.httpclient.HttpClient;
  import org.apache.commons.httpclient.HttpException;
  import org.apache.commons.httpclient.methods.GetMethod;
  /**
   * Demonstrates fetching a web page with Apache Commons HttpClient while
   * sending the request headers of a search-engine spider (Baiduspider here),
   * and decoding the response body when the server compresses it with gzip.
   *
   * @author Ivan
   */
  public class HttpClientTest {

    /** HTTP status code of a successful response. */
    private static final int HTTP_OK = 200;

    /** Terminator appended after every line of the decoded body. */
    private static final String LINE_SEPARATOR = "\r\n";

    /**
     * Requests {@code http://www.iteye.com} with spider-like headers and
     * prints the response charset, the {@code Content-Encoding} header and
     * the decoded body to standard output.
     *
     * @param args unused
     * @throws HttpException if the HTTP exchange fails at the protocol level
     * @throws IOException   if the connection or the body cannot be read
     */
    public static void main(String[] args) throws HttpException, IOException {
      // The client plays the role of a browser instance.
      HttpClient httpclient = new HttpClient();
      GetMethod getMethod = new GetMethod("http://www.iteye.com"); // or http://itindex.net

      // Headers of the Baidu spider. To pose as Googlebot instead, use
      // User-Agent "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
      // and Accept-Encoding "gzip, deflate".
      getMethod.setRequestHeader("Host", "www.iteye.com"); // must match the request URL's host
      getMethod.setRequestHeader("Connection", "Keep-Alive");
      getMethod.setRequestHeader("Accept", "*/*");
      // NOTE(review): this address looks copied from the Googlebot variant
      // (and is spelled "goolebot") -- confirm whether a From header is
      // wanted at all when posing as Baiduspider.
      getMethod.setRequestHeader("From", "goolebot@googlebot.com");
      getMethod.setRequestHeader(
          "User-Agent",
          "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
      // Ask for a gzip body so the decompression path below is exercised.
      getMethod.setRequestHeader("Accept-Encoding", "gzip");

      try {
        int statusCode = httpclient.executeMethod(getMethod);
        if (statusCode != HTTP_OK) {
          // Reported on stderr so the stdout output keeps its original shape.
          System.err.println("Unexpected HTTP status: " + statusCode);
        }
        System.out.println(getMethod.getResponseCharSet());
        System.out.println(getMethod.getResponseHeader("Content-Encoding"));
        System.out.println(getBodyAsString(getMethod, getMethod.getResponseCharSet()));
      } finally {
        // Covers the paths where getBodyAsString was never reached.
        getMethod.releaseConnection();
      }
    }

    /**
     * Reads the response body of an executed request as text, transparently
     * decompressing it when the {@code Content-Encoding} response header
     * mentions gzip. Every line of the body is terminated with
     * {@code "\r\n"} in the result, whatever terminator the server used.
     *
     * <p>The method is aborted and its connection released before this call
     * returns, whether it succeeds or throws.
     *
     * @param getHC   the already executed GET method
     * @param charset name of the charset used to decode the body; when
     *                {@code null}, the charset reported by {@code getHC} is used
     * @return the decoded body, or an empty string when the response has no body
     * @throws IOException if the body cannot be read or is not valid gzip
     *                     data, or if {@code charset} is not supported
     */
    public static String getBodyAsString(GetMethod getHC, String charset)
        throws IOException {
      InputStream body = null;
      try {
        body = getHC.getResponseBodyAsStream();
        if (body == null) {
          // Nothing to decode, e.g. a response without content.
          return "";
        }
        if (isGzipEncoded(getHC)) {
          // Set up the gzip decompression stream on top of the raw body.
          body = new GZIPInputStream(body);
        }
        String effectiveCharset = (charset != null) ? charset : getHC.getResponseCharSet();
        java.io.BufferedReader reader =
            new java.io.BufferedReader(new InputStreamReader(body, effectiveCharset));

        StringBuilder text = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
          text.append(line).append(LINE_SEPARATOR);
        }
        return text.toString();
      } finally {
        if (body != null) {
          try {
            // Closing the outermost stream also closes the ones it wraps.
            body.close();
          } catch (IOException ignored) {
            // Either the body was fully read already or a more relevant
            // exception is propagating; a failed close adds nothing.
          }
        }
        getHC.abort();
        getHC.releaseConnection();
      }
    }

    /** Tells whether the Content-Encoding response header mentions gzip. */
    private static boolean isGzipEncoded(GetMethod method) {
      if (method.getResponseHeader("Content-Encoding") == null) {
        return false;
      }
      String encoding = method.getResponseHeader("Content-Encoding").getValue();
      // Fixed locale: default-locale lower-casing breaks "GZIP" under e.g. Turkish.
      return encoding != null
          && encoding.toLowerCase(java.util.Locale.ENGLISH).contains("gzip");
    }
  }
  Via http://itindex.net
页: [1]
查看完整版本: 使用 Apache HttpClient 工具模拟百度蜘蛛或浏览器抓取和解压gzip网页