您现在的位置是:网站首页> 编程开发> java 编程开发

java-采集省市县街道四级联动源码

jeef2021-03-23java 1978人已围观

简介通过java-采集国家统计局四级联动程序,数据来源是 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html数据采集只为了学习讨论使用,请大家不要恶意做为非法用途。 闲话少说,直接贴代码。需要引hutool的pom文件,可以自己去找最新版<dependency> <groupId>cn.hu

java-采集省市县街道四级联动源码

最后更新:2021-03-23 16:20:18

推荐指数

本文通过 Java 程序采集国家统计局的省、市、县、街道四级联动数据,数据来源是 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html

数据采集仅供学习讨论使用,请大家不要用于非法用途。闲话少说,直接贴代码。


需要在 pom 文件中引入 hutool 依赖,可以自行查找最新版本:

<dependency>
	<groupId>cn.hutool</groupId>
	<artifactId>hutool-all</artifactId>
	<version>5.5.5</version>
</dependency>


package com.pss.mall.admin.controller;

import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONArray;
import com.pss.mall.admin.WebApplication;
import com.pss.mall.common.util.SpringUtil;
import com.pss.mall.entity.model.AreaNew;
import com.pss.mall.service.AreaNewService;
import com.pss.mall.service.impl.AreaNewServiceImpl;
import org.springframework.boot.SpringApplication;
import org.springframework.context.ApplicationContext;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @Description:地址采集
 * @Author: Jeff
 * @Date: 2021/3/22 13:50
 */
public class test {

    public static void main(String[] args) {
        SpringApplication.run(WebApplication.class, args);
        String strurl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html";//目标URL
        try {
            getP(strurl);
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    /**
     * 读取一个网页全部内容
     */
    public static String getOneHtml(String htmlurl) {
        Map<String, Object> headers = new HashMap<>();
        headers.put("Cookie", "wzws_cid=d21de43cf846b12ee9804e34afdbb29987840ab591e7ee9c7b0c63342ee8da64554032b01e999718fe7a51320bce6eee6998194cc56c072746f8d5e8ee6beb6a41b1f21fae7b02dfc6830409f5e2f669f2eaa5b0b7a523dd4917aaee4ddeca99; path=/; HttpOnly; expires=Tue, 23 Mar 2021 04:14:07 GMT");
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0");
        headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        String str = HttpUtil.post(htmlurl, headers, -1);
        System.out.println(str);
        return str;
    }

    /**
     * @param s
     * @return 获得网页省
     */
    public static void getP(String s) {
        String str = getOneHtml(s);
        String regex1;
        String regex2;
        String regex3;
        String regex4;

        regex1 = "<a href='(.*?)'>(.*?)<br/></a>";//省
        regex2 = "<tr class='citytr'><td><a href='(.*?)'>(.*?)</a></td><td><a href='(.*?).html'>(.*?)</a></td></tr>";//市
        regex3 = "<tr class='countytr'><td><a href='(.*?)'>(.*?)</a></td><td><a href='(.*?)'>(.*?)</a></td></tr>";//区
        regex4 = "<a href='(.*?)'>(.*?)</a></td><td><a href='(.*?)'>(.*?)</a>";//街道
        Pattern pa1 = Pattern.compile(regex1, Pattern.CANON_EQ);
        Pattern pa2 = Pattern.compile(regex2, Pattern.CANON_EQ);
        Pattern pa3 = Pattern.compile(regex3, Pattern.CANON_EQ);
        Pattern pa4 = Pattern.compile(regex4, Pattern.CANON_EQ);
        Matcher ma = pa1.matcher(str);

        while (ma.find()) {
            List<AreaNew> eList = new ArrayList<>();
            String code = ma.group(1).replaceAll(".html", "");
            AreaNew province = new AreaNew();
            province.setAreaId(Long.valueOf(code + "0000000000"));
            province.setAreaName(ma.group(2));
            province.setLevel(1);
            province.setParentId(0L);
            eList.add(province);
            String cityUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/" + ma.group(1);//市URL
            String cityStr = getOneHtml(cityUrl);
//            Thread.sleep(2000);
            Matcher cityMa = pa2.matcher(cityStr);
            while (cityMa.find()) {
                AreaNew city = new AreaNew();
                city.setLevel(2);
                city.setAreaName(cityMa.group(4));
                city.setAreaId(Long.valueOf(cityMa.group(2)));
                city.setParentId(province.getAreaId());
                eList.add(city);
                String areaUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/" + cityMa.group(1);//区URL
                String areaStr = getOneHtml(areaUrl);
//                Thread.sleep(2000);
                Matcher areaMa = pa3.matcher(areaStr);
                while (areaMa.find()) {
                    AreaNew area = new AreaNew();
                    area.setLevel(3);
                    area.setAreaName(areaMa.group(4));
                    area.setAreaId(Long.valueOf(areaMa.group(2)));
                    area.setParentId(city.getAreaId());
                    eList.add(area);
                    String towntrUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/" + code + "/" + areaMa.group(1);//街道URL
                    String towntrStr = getOneHtml(towntrUrl);
//                Thread.sleep(2000);
                    Matcher towntrMa = pa4.matcher(towntrStr);
                    while (towntrMa.find()) {
                        AreaNew towntr = new AreaNew();
                        towntr.setLevel(4);
                        towntr.setAreaName(towntrMa.group(4));
                        towntr.setAreaId(Long.valueOf(towntrMa.group(2)));
                        towntr.setParentId(area.getAreaId());
                        eList.add(towntr);
                    }
                }
            }
            JSONArray eJson = new JSONArray(eList);
            System.out.println(eJson);
            if (eList.size() > 0) {
                ApplicationContext context = SpringUtil.getApplicationContext();
                AreaNewService areaNewService = context.getBean(AreaNewServiceImpl.class);
                areaNewService.saveBatch(eList);
            }
        }
    }

}

package com.pss.mall.common.util;

import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;

/**
 * @Description
 * @Author: Jeff
 * @Date: 2021/3/23 10:45
 */

@Component
public class SpringUtil implements ApplicationContextAware {

    private static ApplicationContext applicationContext = null;

    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        if (SpringUtil.applicationContext == null) {
            SpringUtil.applicationContext = applicationContext;
        }
    }

    //获取applicationContext
    public static ApplicationContext getApplicationContext() {
        return applicationContext;
    }

    //通过name获取 Bean.
    public static Object getBean(String name) {
        return getApplicationContext().getBean(name);
    }

    //通过class获取Bean.
    public static <T> T getBean(Class<T> clazz) {
        return getApplicationContext().getBean(clazz);
    }

    //通过name,以及Clazz返回指定的Bean
    public static <T> T getBean(String name, Class<T> clazz) {
        return getApplicationContext().getBean(name, clazz);
    }

}
package com.pss.mall.entity.model;

import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;

import java.io.Serializable;

/**
 * @Description ${DESCRIPTION}
 * @Author: Jeff
 * @Date: 2021/3/23 11:11
 */
@Data
@TableName(value = "tz_area_new")
public class AreaNew implements Serializable {
    @TableField(value = "area_id")
    private Long areaId;

    @TableField(value = "area_name")
    private String areaName;

    @TableField(value = "parent_id")
    private Long parentId;

    @TableField(value = "level")
    private Integer level;

    private static final long serialVersionUID = 1L;
}


很赞哦! (125)

文章评论

来说两句吧...

验证码: