您现在的位置是:网站首页> 编程开发> java 编程开发
java-采集省市县街道四级联动源码
jeef2021-03-23【java】
2539人已围观
简介通过java-采集国家统计局四级联动程序,数据来源是 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html数据采集只为了学习讨论使用,请大家不要恶意做为非法用途。 闲话少说,直接贴代码。需要引hutool的pom文件,可以自己去找最新版<dependency> <groupId>cn.hu

java-采集省市县街道四级联动源码
最后更新:2021-03-23 16:20:18
推荐指数:
本文介绍一个用 Java 采集国家统计局省、市、县、街道四级联动数据的程序,数据来源是 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html
数据采集仅供学习讨论使用,请大家不要将其用于非法用途。闲话少说,直接贴代码。
需要在 pom 文件中引入 hutool 依赖,可以自行查找最新版本:
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.5.5</version>
</dependency>

package com.pss.mall.admin.controller;

import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONArray;
import com.pss.mall.admin.WebApplication;
import com.pss.mall.common.util.SpringUtil;
import com.pss.mall.entity.model.AreaNew;
import com.pss.mall.service.AreaNewService;
import com.pss.mall.service.impl.AreaNewServiceImpl;
import org.springframework.boot.SpringApplication;
import org.springframework.context.ApplicationContext;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scrapes the four-level administrative division pages (province, city,
 * county, town) and stores the rows through {@link AreaNewService}.
 *
 * @Description: address collection
 * @Author: Jeff
 * @Date: 2021/3/22 13:50
 */
public class test {

    /** Directory that every relative page link is resolved against. */
    private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/";

    /** Province link on the index page: group 1 = href, group 2 = name. */
    private static final Pattern PROVINCE_PATTERN = Pattern.compile(
            "<a href='(.*?)'>(.*?)<br/></a>", Pattern.CANON_EQ);

    /** City row: group 1 = href, group 2 = code, group 4 = name. */
    private static final Pattern CITY_PATTERN = Pattern.compile(
            "<tr class='citytr'><td><a href='(.*?)'>(.*?)</a></td><td><a href='(.*?).html'>(.*?)</a></td></tr>",
            Pattern.CANON_EQ);

    /** County row: group 1 = href, group 2 = code, group 4 = name. */
    private static final Pattern COUNTY_PATTERN = Pattern.compile(
            "<tr class='countytr'><td><a href='(.*?)'>(.*?)</a></td><td><a href='(.*?)'>(.*?)</a></td></tr>",
            Pattern.CANON_EQ);

    /** Town/street row: group 2 = code, group 4 = name. */
    private static final Pattern TOWN_PATTERN = Pattern.compile(
            "<a href='(.*?)'>(.*?)</a></td><td><a href='(.*?)'>(.*?)</a>", Pattern.CANON_EQ);

    /**
     * Starts the Spring application, then scrapes from the index page.
     * Any exception raised while scraping is printed and otherwise ignored.
     *
     * @param args command-line arguments, forwarded to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(WebApplication.class, args);
        // Target URL: the index page listing all provinces.
        String strurl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html";
        try {
            getP(strurl);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads one page and returns its body as text.
     *
     * <p>The page is fetched with GET and the Cookie / User-Agent / Accept
     * values are sent as real request headers. The previous
     * {@code HttpUtil.post(url, map, timeout)} call passed that map as the
     * form body, so none of the values reached the server as headers.
     *
     * @param htmlurl absolute URL of the page to fetch
     * @return the response body; it is also echoed to standard output
     */
    public static String getOneHtml(String htmlurl) {
        Map<String, String> headers = new HashMap<>();
        // NOTE(review): this is a captured cookie whose literal carries an
        // expiry of 23 Mar 2021 -- presumably it has to be refreshed; confirm.
        headers.put("Cookie", "wzws_cid=d21de43cf846b12ee9804e34afdbb29987840ab591e7ee9c7b0c63342ee8da64554032b01e999718fe7a51320bce6eee6998194cc56c072746f8d5e8ee6beb6a41b1f21fae7b02dfc6830409f5e2f669f2eaa5b0b7a523dd4917aaee4ddeca99; path=/; HttpOnly; expires=Tue, 23 Mar 2021 04:14:07 GMT");
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0");
        headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        String str = HttpUtil.createGet(htmlurl)
                .headerMap(headers, true)
                .execute()
                .body();
        System.out.println(str);
        return str;
    }

    /**
     * Walks the index page at {@code s} and, for every province link found,
     * follows the city, county and town pages beneath it. The rows collected
     * for one province are printed as JSON and saved in a single batch.
     *
     * <p>Requests are issued back to back with no delay between them.
     *
     * @param s URL of the index page that lists the provinces
     */
    public static void getP(String s) {
        String indexHtml = getOneHtml(s);
        Matcher provinceMa = PROVINCE_PATTERN.matcher(indexHtml);
        while (provinceMa.find()) {
            List<AreaNew> eList = new ArrayList<>();

            // Literal replace: replaceAll(".html", "") would treat '.' as a
            // regex wildcard.
            String code = provinceMa.group(1).replace(".html", "");
            AreaNew province = new AreaNew();
            // The province code is padded with ten zeros to build its id.
            province.setAreaId(Long.valueOf(code + "0000000000"));
            province.setAreaName(provinceMa.group(2));
            province.setLevel(1);
            province.setParentId(0L);
            eList.add(province);

            // City page URL.
            String cityUrl = BASE_URL + provinceMa.group(1);
            Matcher cityMa = CITY_PATTERN.matcher(getOneHtml(cityUrl));
            while (cityMa.find()) {
                AreaNew city = new AreaNew();
                city.setLevel(2);
                city.setAreaName(cityMa.group(4));
                city.setAreaId(Long.valueOf(cityMa.group(2)));
                city.setParentId(province.getAreaId());
                eList.add(city);

                // County page URL.
                String areaUrl = BASE_URL + cityMa.group(1);
                Matcher areaMa = COUNTY_PATTERN.matcher(getOneHtml(areaUrl));
                while (areaMa.find()) {
                    AreaNew area = new AreaNew();
                    area.setLevel(3);
                    area.setAreaName(areaMa.group(4));
                    area.setAreaId(Long.valueOf(areaMa.group(2)));
                    area.setParentId(city.getAreaId());
                    eList.add(area);

                    // Town page URL: the county href is prefixed with the
                    // province code directory.
                    String towntrUrl = BASE_URL + code + "/" + areaMa.group(1);
                    Matcher towntrMa = TOWN_PATTERN.matcher(getOneHtml(towntrUrl));
                    while (towntrMa.find()) {
                        AreaNew towntr = new AreaNew();
                        towntr.setLevel(4);
                        towntr.setAreaName(towntrMa.group(4));
                        towntr.setAreaId(Long.valueOf(towntrMa.group(2)));
                        towntr.setParentId(area.getAreaId());
                        eList.add(towntr);
                    }
                }
            }

            JSONArray eJson = new JSONArray(eList);
            System.out.println(eJson);
            if (!eList.isEmpty()) {
                ApplicationContext context = SpringUtil.getApplicationContext();
                AreaNewService areaNewService = context.getBean(AreaNewServiceImpl.class);
                areaNewService.saveBatch(eList);
            }
        }
    }
}

package com.pss.mall.common.util;

import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;

/**
 * Keeps a static reference to the Spring {@link ApplicationContext} so that
 * beans can be looked up from static code.
 *
 * @Author: Jeff
 * @Date: 2021/3/23 10:45
 */
@Component
public class SpringUtil implements ApplicationContextAware {

    private static ApplicationContext applicationContext = null;

    /**
     * Stores the context the first time it is supplied; later calls are
     * ignored once a context has been recorded.
     */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        if (SpringUtil.applicationContext == null) {
            SpringUtil.applicationContext = applicationContext;
        }
    }

    /** Returns the stored context, or {@code null} if none has been set yet. */
    public static ApplicationContext getApplicationContext() {
        return applicationContext;
    }

    /** Looks up a bean by name. */
    public static Object getBean(String name) {
        return getApplicationContext().getBean(name);
    }

    /** Looks up a bean by type. */
    public static <T> T getBean(Class<T> clazz) {
        return getApplicationContext().getBean(clazz);
    }

    /** Looks up a bean by name and type. */
    public static <T> T getBean(String name, Class<T> clazz) {
        return getApplicationContext().getBean(name, clazz);
    }
}

package com.pss.mall.entity.model;

import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;

import java.io.Serializable;

/**
 * One administrative area row, mapped to table {@code tz_area_new}.
 *
 * @Author: Jeff
 * @Date: 2021/3/23 11:11
 */
@Data
@TableName(value = "tz_area_new")
public class AreaNew implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Area id (column {@code area_id}). */
    @TableField(value = "area_id")
    private Long areaId;

    /** Area name (column {@code area_name}). */
    @TableField(value = "area_name")
    private String areaName;

    /** Id of the parent area (column {@code parent_id}). */
    @TableField(value = "parent_id")
    private Long parentId;

    /** Depth in the hierarchy (column {@code level}). */
    @TableField(value = "level")
    private Integer level;
}
很赞哦! (125)
下一篇:Linux服务器上Jdk的安装
文章评论
验证码: