HttpBot网络爬虫

HttpBot 是对 java.net.HttpURLConnection类的简单封装,可以方便的获取网页内容,并且自动管理session,自动处理301重定向等。虽然不能像HttpClient那样强大,支持完整的Http协议,但却非常地灵活,可以满足我目前所有的相关需求。

获取Google首页Html代码仅需要如下两行代码即可: HttpBot httpBot=new HttpBot(); String html=httpBot.doGet(“http://www.google.com”); 感兴趣的可以到 http://hijava.googlecode.com/svn/HttpBot (SVN地址) 下载最新的源码。目前还太简单(200行代码),不过仍在不断维护中。

HttpBot部分源代码: package org.hijava.httpbot;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;

/**
 * Http网络爬虫类
 *
 * @author yava
 */
public class HttpBot {
	private Map cookieMap;
	private String userAgent;
	private String encoding;
	private String host;
	private String referer;
	private int responseCode;
	private static final String separator = System.getProperty("line.separator");
	private static final String GET="GET";
	private static final String POST="POST";

	public HttpBot() {
		this.userAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";
		this.encoding = "UTF-8";
		cookieMap=new HashMap();
	}

	public HttpBot(String host,String referer){
		this();
		this.host=host;
		this.referer=referer;
	}
	/**
	 * 以GET方式发送请求,返回页面html代码
	 * @param urlStr
	 * @return
	 */
	public String doGet(String urlStr) {
		HttpURLConnection http=getConnection(urlStr,HttpBot.GET);
		String content="";
		try {
			http.connect();
			processCookie(http);
			this.referer=urlStr;
			this.responseCode=http.getResponseCode();
			if(this.responseCode==302){
				String location=http.getHeaderField("Location");
				return doGet(location);
			}
			InputStream is=http.getInputStream();
			content=getContent(is);
			is.close();
			http.disconnect();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return content;
	}
	/**
	 * 以Post方式发送请求,返回页面html代码
	 * @param urlStr
	 * @param paramMap 请求参数
	 * @return
	 */
	public String doPost(String urlStr,Map paramMap){
		HttpURLConnection http=getConnection(urlStr,HttpBot.POST);
		http.setDoOutput(true);
		String content="";
		try {
			OutputStream os = http.getOutputStream();
			os.write(getParamBytes(paramMap));
			http.connect();
			processCookie(http);
			this.referer=urlStr;
			this.responseCode=http.getResponseCode();
			if(this.responseCode==302){
				String location=http.getHeaderField("Location");
				return doGet(location);
			}
			InputStream is=http.getInputStream();
			content=getContent(is);
			is.close();
			http.disconnect();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return content;
	}
	/**
	 * 从输入流获取网页内容
	 * @param is
	 * @return
	 */
	private String getContent(InputStream is){
		StringBuilder builder = new StringBuilder();
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(is,encoding));
			String line;
			while ((line = reader.readLine()) != null) {
				builder.append(line).append(separator);
			}
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}

		return builder.toString();
	}

	private HttpURLConnection getConnection(String urlStr,String reqMethod){
		HttpURLConnection http=null;
		try {
			URL url = new URL(urlStr);
			http = (HttpURLConnection) url.openConnection();
			http.setRequestProperty("User-Agent", this.userAgent);
			http.setRequestProperty("Host", this.host);
			http.setRequestProperty("Cookie", this.getCookie());
			http.setRequestProperty("Referer", this.referer);
			http.setRequestMethod(reqMethod);
			http.setInstanceFollowRedirects(false);
		} catch (IOException e) {
			e.printStackTrace();
		}

		return http;
	}
	/**
	 * 处理cookie
	 * @param http
	 */
	private void processCookie(HttpURLConnection http){
		String key = null;
		for (int i = 1; (key = http.getHeaderFieldKey(i)) != null; i++) {
			if (key.equalsIgnoreCase("set-cookie")) {
				String cookie = null;
				cookie = http.getHeaderField(i);
				int i1=cookie.indexOf("=");
				int i2=cookie.indexOf(";");
				if(i1!=-1&&i2!=-1){
					String _value=cookie.substring(i1+1, i2);
					if("EXPIRED".equalsIgnoreCase(_value)){
						continue;
					}
					String _key=cookie.substring(0, i1);
					cookieMap.put(_key, _value);
				}
			}
		}
	}
	/**
	 * 获取cookie
	 * @return
	 */
	public String getCookie(){
		String cookie="";
		for(Map.Entry entry:cookieMap.entrySet()){
			cookie=cookie+entry.getKey()+"="+entry.getValue()+";";
		}
		return cookie;
	}

	private byte[] getParamBytes(Map paramMap){
		String paramStr="";
		for(Map.Entry entry:paramMap.entrySet()){
			String value="";
			try {
				value=URLEncoder.encode(entry.getValue(), "UTF-8");
			} catch (UnsupportedEncodingException e) {
				e.printStackTrace();
			}
			paramStr=paramStr+entry.getKey()+"="+value+"&";
		}
		return paramStr.getBytes();
	}
}
yava /
in categories tagged with