|
使用 Apache Commons HttpClient(3.x)实现网页抓取功能
/**
 * Fetches the resource at the given URL with an HTTP GET and returns its body as a string.
 *
 * <p>The body is decoded with the charset reported by the response. When that charset is
 * ISO-8859-1 the {@code encode} argument is used instead, falling back to gb2312 if
 * {@code encode} is null or empty.
 * NOTE(review): ISO-8859-1 is presumably what the library reports when the server declares
 * no charset at all, which is why it is not trusted here - confirm against the library docs.
 *
 * <p>Failures are reported on {@code System.err} and produce an empty string: a status other
 * than 200, a protocol or transport error, an unsupported charset name, or an
 * {@link IllegalArgumentException} raised while building or executing the request
 * (for example for a malformed URL).
 *
 * @param urlstr the URL to fetch
 * @param encode charset name used when the response reports ISO-8859-1; may be null or empty,
 *               in which case gb2312 is used
 * @return the decoded response body, or an empty string if the request failed or had no body
 */
public static String snatch(String urlstr, String encode) {
    String rs = "";
    // Create an instance of HttpClient.
    HttpClient client = new HttpClient();
    GetMethod method = null;
    try {
        // Build the request inside the try block so that an IllegalArgumentException
        // thrown for a bad URL reaches the handler below instead of escaping to the caller.
        method = new GetMethod(urlstr);
        // Retry handler with a retry count of 0: a failed request is not retried.
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler(0, false));
        // Optional: send a browser-like User-Agent header.
        // method.getParams().setParameter(HttpMethodParams.USER_AGENT,
        // "Mozilla/4.0 (compatible; MSIE 6.0; Windows 2000)");

        // Execute the method.
        int statusCode = client.executeMethod(method);
        if (statusCode != HttpStatus.SC_OK) {
            System.err.println("Method failed: " + statusCode);
            System.err.println("Method failed: " + method.getStatusLine());
        } else {
            // Read the response body; guard against a null (absent) body before decoding.
            byte[] responseBody = method.getResponseBody();
            if (responseBody != null) {
                String charset = method.getResponseCharSet();
                boolean isLatin1OrUnknown = charset == null
                        || charset.trim().equalsIgnoreCase("ISO-8859-1");
                if (!isLatin1OrUnknown) {
                    // The response reported a specific charset: use it.
                    rs = new String(responseBody, charset);
                } else if (encode != null && encode.length() > 0) {
                    // Use the caller's charset instead of ISO-8859-1.
                    rs = new String(responseBody, encode);
                } else {
                    // No usable hint from the response or the caller.
                    rs = new String(responseBody, "gb2312");
                }
            }
        }
    } catch (HttpException e) {
        System.err.println("Fatal protocol violation: " + e.getMessage());
        e.printStackTrace();
    } catch (IOException e) {
        // Also covers UnsupportedEncodingException from the String constructors above.
        System.err.println("Fatal transport error: " + e.getMessage());
        System.err.println("=============" + urlstr);
        e.printStackTrace();
    } catch (java.lang.IllegalArgumentException e) {
        System.err.println("报错的url是:" + urlstr);
        e.printStackTrace();
    } finally {
        // Release the connection (method is null if the request could not be built).
        if (method != null) {
            method.releaseConnection();
        }
    }
    return rs;
}
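以上代码就是一个简单的用 HttpClient 远程抓取页面源码的例子,不过记得要导入下面这些包(GetMethod、HttpMethodParams 和 IOException 不在 org.apache.commons.httpclient 包下,需要单独导入):
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import java.io.IOException;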
还有就是中间注释掉的代码
// method.getParams().setParameter(HttpMethodParams.USER_AGENT,
// "Mozilla/4.0 (compatible; MSIE 6.0; Windows 2000)");
大家要注意:有些网站会拒绝爬虫的抓取请求,所以抓取不到内容时,需要取消这段代码的注释,为请求设置 User-Agent。
使用时直接调用 snatch("url地址", "编码") 方法即可,其中 url 地址一定要以 http:// 开头。 |
|