Java实现网络爬虫功能代码
深山老妖 浏览:704 2019-03-15 07:49:55 本文累计收益:0 我也要赚钱

自己爬取的网站内容为12306的余票查询模块。利用火狐浏览器为Web开发者内置的Web控制台,可得到爬取网页的请求网址,如下图所示:


其中的请求网址即为我们需要爬取的网址。另外,可得知其请求协议采用的是Https协议,采用GET方式访问。爬取源代码如下所示:

// Value sent as the queryDate request parameter (a date string such as 2015-04-19).
public static String queryDate = "2015-04-19";
        // Value sent as the from_station request parameter (presumably a 12306 station code; confirm).
        public static String from_station = "JNK";
        // Value sent as the to_station request parameter (presumably a 12306 station code; confirm).
        public static String to_station = "BJP";
     
        /**
         * Queries the 12306 ticket endpoint for {@code queryDate}, {@code from_station}
         * and {@code to_station} and prints each returned item to standard output.
         *
         * <p>The response JSON is mapped onto {@code Trains} with Gson. When the HTTP
         * call yields no body, or the payload reports a status other than 200, a
         * diagnostic is written to standard error and no items are printed.
         *
         * @param args command-line arguments (unused)
         * @throws Exception if the HTTPS request or the JSON parsing fails
         */
        public static void main(String[] args) throws Exception {

            // SECURITY NOTE(review): this verifier accepts every host name and is
            // installed as the JVM-wide default below; trustAllHttpsCertificates()
            // presumably disables certificate validation as well. Acceptable for a
            // demo only - do not ship this to production.
            HostnameVerifier hv = new HostnameVerifier() {
                @Override
                public boolean verify(String urlHostName, SSLSession session) {
                    System.out.println("Warning: URL Host: " + urlHostName
                            + " vs. " + session.getPeerHost());
                    return true;
                }
            };

            String url = "https://kyfw.12306.cn/otn/lcxxcx/query?purpose_codes=ADULT&queryDate="
                    + queryDate
                    + "&from_station="
                    + from_station
                    + "&to_station="
                    + to_station;

            ProtocolUrlValidator.trustAllHttpsCertificates();
            HttpsURLConnection.setDefaultHostnameVerifier(hv);

            String result = WebServiceUtil.invokeByHTTPGET(url, null);
            if (result == null) {
                // invokeByHTTPGET hands back null whenever the HTTP status is not 200.
                System.err.println("Query failed: no response body was returned");
                return;
            }

            Gson gson = new Gson();
            Trains trains = gson.fromJson(result, Trains.class);
            if (trains == null) {
                System.err.println("Query failed: response body was empty");
                return;
            }

            // Check the reported status before touching the data section: an error
            // payload is not guaranteed to carry one.
            if (trains.getHttpstatus() != 200) {
                System.err.println("Query failed: " + trains.getMessages());
                return;
            }

            if (trains.getData() == null) {
                return;
            }
            List<Item> items = trains.getData().getItems();
            if (items != null) {
                for (Item item : items) {
                    System.out.println(item);
                }
            }
        }
    }

由于使用的协议为HTTPS,访问之前需要先处理证书校验:示例代码通过trustAllHttpsCertificates()信任所有证书,并让HostnameVerifier始终返回true,即实际上跳过了校验,仅适合在测试环境中使用。其中原文以蓝色标出的url字符串即为我们需要访问的网址,涉及到的invokeByHTTPGET(url,null)代码如下所示:

public class WebServiceUtil {
     
        /**
         * 通过SOAP1.1协议调用Web服务
         *
         * @param wsdl        WSDL路径
         * @param method    方法名
         * @param namespace    命名空间
         * @param headerParameters 头参数
         * @param bodyParameters   体参数
         * @param isBodyParametersNS 体参数是否有命名空间
         * @return    String
         * @throws Exception
         */
        public static String invokeBySoap11(String wsdl, String method,
                String namespace, Map<String, String> headerParameters,
                Map<String, String> bodyParameters, boolean isBodyParametersNS)
                throws Exception {
            StringBuffer soapOfResult = null;
            // 去除 ?wsdl,获取方法列表
            int length = wsdl.length();
            wsdl = wsdl.substring(0, length - 5);
            URL url = new URL(wsdl);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("POST");
            conn.setDoInput(true);
            conn.setDoOutput(true);
            conn.setRequestProperty("Content-Type", "text/xml;charset=utf-8");
            OutputStream out = conn.getOutputStream();
            // 获取soap1.1版本消息
            StringBuilder sb = new StringBuilder();
            sb.append("<soap:Envelope xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"
                    xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\" xmlns:soap=\"http://schemas.xmlsoap.org/soap/envelope/\" ");
            sb.append("xmlns:ns0=\"" + namespace + "\"");
            sb.append(">");
            if (headerParameters != null) {
                sb.append("<soap:Header>");
                for (Entry<String, String> headerParameter : headerParameters
                        .entrySet()) {
                    sb.append("<ns0:");
                    sb.append(headerParameter.getKey());
                    sb.append(">");
                    sb.append(headerParameter.getValue());
                    sb.append("</ns0:");
                    sb.append(headerParameter.getKey());
                    sb.append(">");
                }
                sb.append("</soap:Header>");
            }
            sb.append("<soap:Body><ns0:");
            sb.append(method);
            sb.append(">");
            // 输入参数
            if (bodyParameters != null) {
                for (Entry<String, String> inputParameter : bodyParameters
                        .entrySet()) {
                    if (isBodyParametersNS) {
                        sb.append("<ns0:");
                        sb.append(inputParameter.getKey());
                        sb.append(">");
                        sb.append(inputParameter.getValue());
                        sb.append("</ns0:");
                        sb.append(inputParameter.getKey());
                        sb.append(">");
                    } else {
                        sb.append("<");
                        sb.append(inputParameter.getKey());
                        sb.append(">");
                        sb.append(inputParameter.getValue());
                        sb.append("</");
                        sb.append(inputParameter.getKey());
                        sb.append(">");
                    }
                }
            }
            sb.append("</ns0:");
            sb.append(method);
            sb.append("></soap:Body></soap:Envelope>");
            //System.out.println(sb.toString());
            out.write(sb.toString().getBytes());
            int code = conn.getResponseCode();
            if (code == 200) {
                InputStream is = conn.getInputStream();
                byte[] b = new byte[1024];
                int len = 0;
                soapOfResult = new StringBuffer();
                while ((len = is.read(b)) != -1) {
                    String s = new String(b, 0, len, "UTF-8");
                    soapOfResult.append(s);
                }
            }
            conn.disconnect();
            return soapOfResult == null ? null : soapOfResult.toString();
        }
     
        /**
         * 通过SOAP1.2协议调用Web服务
         *
         * @param wsdl
         * @param method
         * @param namespace
         * @param headerParameters
         * @param bodyParameters
         * @param isBodyParametersNS
         * @return
         * @throws Exception
         */
        public static String invokeBySoap12(String wsdl, String method,
                String namespace, Map<String, String> headerParameters,
                Map<String, String> bodyParameters, boolean isBodyParametersNS)
                throws Exception {
            StringBuffer soapOfResult = null;
            // 去除 ?wsdl
            int length = wsdl.length();
            wsdl = wsdl.substring(0, length - 5);
            URL url = new URL(wsdl);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("POST");
            conn.setDoInput(true);
            conn.setDoOutput(true);
            conn.setRequestProperty("Content-Type", "text/xml;charset=utf-8");
            OutputStream out = conn.getOutputStream();
            // 获取soap1.1版本消息
            StringBuilder sb = new StringBuilder();
            sb.append("<soap12:Envelope xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"
                    xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\" soap12:Envelope xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" ");
            sb.append("xmlns:ns0=\"" + namespace + "\"");
            sb.append(">");
            if (headerParameters != null) {
                sb.append("<soap12:Header>");
                for (Entry<String, String> headerParameter : headerParameters
                        .entrySet()) {
                    sb.append("<ns0:");
                    sb.append(headerParameter.getKey());
                    sb.append(">");
                    sb.append(headerParameter.getValue());
                    sb.append("</ns0:");
                    sb.append(headerParameter.getKey());
                    sb.append(">");
                }
                sb.append("</soap12:Header>");
            }
            sb.append("<soap12:Body><ns0:");
            sb.append(method);
            sb.append(">");
            // 输入参数
            if (bodyParameters != null) {
                for (Entry<String, String> inputParameter : bodyParameters
                        .entrySet()) {
                    if (isBodyParametersNS) {
                        sb.append("<ns0:");
                        sb.append(inputParameter.getKey());
                        sb.append(">");
                        sb.append(inputParameter.getValue());
                        sb.append("</ns0:");
                        sb.append(inputParameter.getKey());
                        sb.append(">");
                    } else {
                        sb.append("<");
                        sb.append(inputParameter.getKey());
                        sb.append(">");
                        sb.append(inputParameter.getValue());
                        sb.append("</");
                        sb.append(inputParameter.getKey());
                        sb.append(">");
                    }
                }
            }
            sb.append("</ns0:");
            sb.append(method);
            sb.append("></soap12:Body></soap12:Envelope>");
            System.out.println(sb.toString());
            out.write(sb.toString().getBytes());
            int code = conn.getResponseCode();
            if (code == 200) {
                InputStream is = conn.getInputStream();
                byte[] b = new byte[1024];
                int len = 0;
                soapOfResult = new StringBuffer();
                while ((len = is.read(b)) != -1) {
                    String s = new String(b, 0, len, "UTF-8");
                    soapOfResult.append(s);
                }
            }
            conn.disconnect();
            return soapOfResult == null ? null : soapOfResult.toString();
        }
     
        /**
         * Calls a service by POSTing the parameters as an
         * application/x-www-form-urlencoded request body.
         *
         * <p>Names and values are percent-encoded as UTF-8, so callers pass raw,
         * unencoded text. A null value is sent as an empty value.
         *
         * @param urlPath         address the request is sent to
         * @param inputParameters form fields to send; null or empty sends an empty body
         * @return the response body decoded as UTF-8, or null when the HTTP status is not 200
         * @throws Exception if the URL is malformed or the HTTP exchange fails
         */
        public static String invokeByHTTPPOST(String urlPath, Map<String, String> inputParameters)
                throws Exception {
            // Join the fields with '&' between them; putting the separator in front
            // of every field but the first also copes with an empty map.
            StringBuilder form = new StringBuilder();
            if (inputParameters != null) {
                for (Entry<String, String> inputParameter : inputParameters
                        .entrySet()) {
                    if (form.length() > 0) {
                        form.append("&");
                    }
                    String value = inputParameter.getValue();
                    form.append(java.net.URLEncoder.encode(
                            String.valueOf(inputParameter.getKey()), "UTF-8"));
                    form.append("=");
                    form.append(value == null ? "" : java.net.URLEncoder.encode(value, "UTF-8"));
                }
            }
            // Debug trace of the outgoing form body.
            System.out.println(form.toString());

            HttpURLConnection conn = (HttpURLConnection) new URL(urlPath).openConnection();
            try {
                conn.setRequestMethod("POST");
                conn.setDoInput(true);
                conn.setDoOutput(true);
                conn.setRequestProperty("Content-Type",
                        "application/x-www-form-urlencoded");
                OutputStream out = conn.getOutputStream();
                try {
                    out.write(form.toString().getBytes("UTF-8"));
                    out.flush();
                } finally {
                    out.close();
                }
                if (conn.getResponseCode() != 200) {
                    return null;
                }
                InputStream is = conn.getInputStream();
                try {
                    // Collect raw bytes first and decode once: decoding each chunk
                    // separately breaks multi-byte characters that straddle two reads.
                    java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
                    byte[] chunk = new byte[1024];
                    int len;
                    while ((len = is.read(chunk)) != -1) {
                        body.write(chunk, 0, len);
                    }
                    return body.toString("UTF-8");
                } finally {
                    is.close();
                }
            } finally {
                conn.disconnect();
            }
        }
     
        /**
         * Calls a service with an HTTP GET request, passing the parameters in the
         * query string.
         *
         * <p>Names and values are percent-encoded as UTF-8, so callers pass raw,
         * unencoded text. A null value is sent as an empty value. When
         * {@code urlPath} already contains a query string the parameters are
         * appended to it.
         *
         * @param urlPath         address to request; may already carry a query string
         * @param inputParameters query parameters to append; null or empty leaves
         *                        {@code urlPath} unchanged
         * @return the response body decoded as UTF-8, or null when the HTTP status is not 200
         * @throws Exception if the URL is malformed or the HTTP exchange fails
         */
        public static String invokeByHTTPGET(String urlPath,  Map<String, String> inputParameters)
                throws Exception {
            StringBuilder sb = new StringBuilder();
            sb.append(urlPath);
            if (inputParameters != null && !inputParameters.isEmpty()) {
                // Pick the first separator from what urlPath already ends with, so an
                // existing query string is extended instead of getting a second '?'.
                String separator;
                if (urlPath.indexOf('?') < 0) {
                    separator = "?";
                } else if (urlPath.endsWith("?") || urlPath.endsWith("&")) {
                    separator = "";
                } else {
                    separator = "&";
                }
                for (Entry<String, String> inputParameter : inputParameters
                        .entrySet()) {
                    String value = inputParameter.getValue();
                    sb.append(separator);
                    sb.append(java.net.URLEncoder.encode(
                            String.valueOf(inputParameter.getKey()), "UTF-8"));
                    sb.append("=");
                    sb.append(value == null ? "" : java.net.URLEncoder.encode(value, "UTF-8"));
                    separator = "&";
                }
            }
            // Debug trace of the final request URL.
            System.out.println(sb.toString());

            HttpURLConnection conn = (HttpURLConnection) new URL(sb.toString()).openConnection();
            try {
                // A GET carries no request body, so output mode is left disabled.
                conn.setRequestMethod("GET");
                if (conn.getResponseCode() != 200) {
                    return null;
                }
                InputStream is = conn.getInputStream();
                try {
                    // Collect raw bytes first and decode once: decoding each chunk
                    // separately breaks multi-byte characters that straddle two reads.
                    java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
                    byte[] chunk = new byte[4096];
                    int len;
                    while ((len = is.read(chunk)) != -1) {
                        body.write(chunk, 0, len);
                    }
                    return body.toString("UTF-8");
                } finally {
                    is.close();
                }
            } finally {
                conn.disconnect();
            }
        }

以上代码块涉及到的发送请求方式有:通过SOAP1.1协议调用Web服务、通过SOAP1.2协议调用Web服务、通过HTTP POST传参方式调用服务和通过HTTP GET传参方式调用服务。其具体的请求方式在源代码中已以注释方式详细给出,故此处不再赘述。

在爬取过程中,我们还需要用到json在线校验工具,网址为:点击打开链接。主要利用此工具完成的操作为:验证json格式的正确性,根据json串生成相应的POJO类。如下图所示:

 

至此,网络爬虫的过程基本结束。

评论列表
发表评论
+ 关注