commit 788302b8501d3da48af58a98d6f068820d9d5525 Author: Lincong Date: Wed Apr 2 17:27:24 2025 +0800 first commit diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..0db1c58 --- /dev/null +++ b/pom.xml @@ -0,0 +1,113 @@ + + + 4.0.0 + + org.springframework.boot + spring-boot-starter-parent + 3.2.5 + + + tech.riemann + bidding + 1.0.0 + bidding + 招标信息爬虫 + + 17 + + + + org.springframework.boot + spring-boot-starter-web + + + org.springframework + spring-context + 6.1.6 + + + org.nutz + nutz-spring-boot-starter + 3.3.1 + + + com.ibeetl + beetl + 3.16.1.RELEASE + + + com.alibaba + druid-spring-boot-3-starter + 1.2.22 + + + com.oceanbase + oceanbase-client + 2.4.9 + + + club.zhcs + open-api-spring-boot-starter + 3.2.5 + + + org.jsoup + jsoup + 1.17.2 + + + org.springframework.boot + spring-boot-starter-test + test + + + org.springframework.boot + spring-boot-starter + + + org.projectlombok + lombok + true + + + com.mysql + mysql-connector-j + runtime + + + + + org.seleniumhq.selenium + selenium-java + 4.11.0 + + + + org.seleniumhq.selenium + selenium-chrome-driver + 4.11.0 + + + + + + ${project.artifactId} + + + org.springframework.boot + spring-boot-maven-plugin + + + + org.projectlombok + lombok + + + + + + + + diff --git a/src/main/java/tech/riemann/bidding/BiddingApplication.java b/src/main/java/tech/riemann/bidding/BiddingApplication.java new file mode 100644 index 0000000..e59a494 --- /dev/null +++ b/src/main/java/tech/riemann/bidding/BiddingApplication.java @@ -0,0 +1,26 @@ +package tech.riemann.bidding; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.context.annotation.Bean; +import org.springframework.scheduling.TaskScheduler; +import org.springframework.scheduling.annotation.EnableAsync; +import org.springframework.scheduling.annotation.EnableScheduling; +import org.springframework.scheduling.concurrent.ThreadPoolTaskScheduler; + 
/**
 * Spring Boot entry point of the bidding-notice crawler.
 *
 * <p>Turns on {@code @Scheduled} and {@code @Async} processing and supplies the scheduler
 * that runs the collectors.
 */
@SpringBootApplication
@EnableScheduling
@EnableAsync
public class BiddingApplication {

    /** Number of scheduler threads available to the collectors. */
    private static final int SCHEDULER_POOL_SIZE = 50;

    public static void main(String[] args) {
        SpringApplication.run(BiddingApplication.class, args);
    }

    /**
     * Scheduler backing every {@code @Scheduled} method in the application.
     *
     * @return a thread-pool scheduler with {@link #SCHEDULER_POOL_SIZE} threads
     */
    @Bean
    TaskScheduler taskScheduler() {
        ThreadPoolTaskScheduler scheduler = new ThreadPoolTaskScheduler();
        scheduler.setPoolSize(SCHEDULER_POOL_SIZE);
        return scheduler;
    }
}

// ---- File: src/main/java/tech/riemann/bidding/component/NoticeCollector.java ----
package tech.riemann.bidding.component;

import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.nutz.dao.Cnd;
import org.nutz.http.Header;
import org.nutz.http.Http;
import org.nutz.json.Json;
import org.nutz.lang.Lang;
import org.nutz.lang.util.NutMap;
import org.nutz.log.Logs;
import org.nutz.spring.boot.service.ExtService;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.Scheduled;

import tech.riemann.bidding.entity.Notice;
import tech.riemann.bidding.entity.Notice.Channel;
import tech.riemann.bidding.entity.ScheduledLog;
import tech.riemann.bidding.repository.NoticeRepository;
import tech.riemann.bidding.repository.ScheduledLogRepository;

/**
 * Shared base of all channel collectors — do not modify unless necessary.
 *
 * <p>A subclass supplies the channel, the repositories and {@link #collect()}. This class
 * provides the two-minute schedule, de-duplication against the notice table, keyword
 * matching, and a single background sender that forwards matched notices to the group robots.
 */
public abstract class NoticeCollector implements InitializingBean {
    /**
     * Robot webhook address.
     * NOTE(review): the webhook key is committed in source; consider moving it to configuration.
     */
    public static final String ROBOT_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=7251bcc2-7158-4175-a5ef-bc529e432ee6";
    /**
     * Robot webhook address (three-person group).
     */
    public static final String ROBOT_URL_GX = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=cc1e8fe7-3244-4ffe-80ac-b1528b202a01";
    /**
     * Keywords a notice must contain to be forwarded ("、"-separated).
     */
    public static final String KEY_WORDS = "人力外包、驻场、资源池、软件开发、技术服务、开发服务、人员外包、技术开发、现场、外协";
    /**
     * Industry keywords (not referenced by this class).
     */
    public static final String INDUSTRY_KEY_WORDS = "证券、基金、保险、信托、资管、银行、养老金、中国、国家、烟草、电力、电信、移动";
    /**
     * High-priority keywords; a hit triggers an additional @-mention message.
     */
    public static final String ATTENTION_KEY_WORDS = "资源池、外包";
    /**
     * Blacklist; a notice containing any of these is never forwarded.
     */
    public static final String BLOCKED_KEY_WORDS = "单一、竞争、中标、结果、询价、公示、终止、成交、延期、磋商、失败";

    public static final Pattern DATE_PATTERN = Pattern.compile("\\d{4}[-]\\d{2}[-]\\d{2}");
    public static final Pattern NUMBER_PATTERN = Pattern.compile("\\d+");
    /**
     * Notices waiting to be sent to the robots.
     */
    public static final LinkedBlockingQueue<Notice> NOTICES_QUEUE = new LinkedBlockingQueue<>();

    /**
     * Per-channel count of already-stored notices seen in the current run.
     * Shared by all collectors, so every access goes through a lock on the map itself.
     */
    protected static final NutMap DUPLICATE_STATUS = NutMap.NEW();

    /** Duplicates tolerated per run before the channel stops paging. */
    private static final int MAX_DUPLICATES = 20;
    /** Pause after each forwarded notice, keeping the robots under their rate limit. */
    private static final long SEND_INTERVAL_MILLIS = 5000L;
    /** HTTP timeout used for robot calls. */
    private static final int ROBOT_TIMEOUT_MILLIS = 500;
    /** Same pattern the original built on every call; cached because the formatter is immutable. */
    private static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd");
    /** Mobile numbers mentioned in the high-priority message. */
    private static final List<String> ATTENTION_MOBILES = Lang.list("13811608471", "13018059968", "18343641000",
            "18996359755", "13673518683");
    /** Markdown body sent for each forwarded notice: title, publish date, channel, link. */
    private static final String MESSAGE_TEMPLATE = """
            %s
            >发布时间:%s
            >来源:%s
            >详情:[点击查看详情](%s)
            """;
    /** Ensures only one sender thread exists no matter how many collector beans are created. */
    private static final AtomicBoolean SENDER_STARTED = new AtomicBoolean();

    /**
     * Extracts the first {@code yyyy-MM-dd} date from a piece of text.
     *
     * @param info text that may contain a date; may be {@code null}
     * @return the parsed date, or today when no usable date is present
     */
    public LocalDate date(String info) {
        if (info != null) {
            Matcher matcher = DATE_PATTERN.matcher(info);
            if (matcher.find()) {
                try {
                    return LocalDate.parse(matcher.group(), DATE_FORMAT);
                } catch (DateTimeParseException e) {
                    // The regex accepts digit runs such as 2024-13-45; treat them like "no date".
                    Logs.get().debugf("Unparseable date '%s', falling back to today", matcher.group());
                }
            }
        }
        return LocalDate.now();
    }

    /**
     * Extracts the first run of digits from a piece of text.
     *
     * @param info text that may contain digits; may be {@code null}
     * @return the number, or 0 when the text holds no digits
     */
    public static long number(String info) {
        if (info == null) {
            return 0;
        }
        Matcher matcher = NUMBER_PATTERN.matcher(info);
        return matcher.find() ? Long.parseLong(matcher.group()) : 0;
    }

    /**
     * @return the channel this collector reads
     */
    public abstract Channel channel();

    /**
     * Channel-specific collection logic.
     */
    public abstract void collect();

    /**
     * Runs one collection round every two minutes.
     * The run is logged even when {@link #collect()} fails.
     */
    @Async
    @Scheduled(cron = "0 */2 * * * ?")
    public void start() {
        // New round: reset this channel's duplicate counter.
        synchronized (DUPLICATE_STATUS) {
            DUPLICATE_STATUS.setv(channel().name(), 0);
        }
        Logs.get().debugf("渠道:%s 开始进行数据采集!", channel().getDescription());
        ScheduledLog log = startLog();
        try {
            collect();
        } finally {
            stopLog(log);
        }
    }

    /**
     * Pushes notices in order and stops at the first one that is rejected.
     *
     * @param notices notices parsed from one page
     * @return {@code true} when every notice was accepted, i.e. paging may continue
     */
    public boolean pushNotices(List<Notice> notices) {
        for (Notice notice : notices) {
            if (!pushNotice(notice)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Stores a notice when it is new, and queues it for sending when it matches.
     *
     * @param notice the notice to store
     * @return {@code false} when collection should stop: the notice is older than two days,
     *         or too many duplicates were seen in this run
     */
    public boolean pushNotice(Notice notice) {
        if (notice.getPublishDate().isBefore(LocalDate.now().minusDays(2))) {
            return false;
        }
        boolean unseen = noticeRepository().count(Cnd.where(Notice::getChannel, ExtService.EQ, notice.getChannel())
                .and(Notice::getKey, ExtService.EQ, notice.getKey())) == 0;
        if (unseen) {
            noticeRepository().insert(notice);
            sendMessage(notice);
            return true;
        }
        return registerDuplicate();
    }

    /** Counts one duplicate; returns {@code false} and resets once the per-run limit is reached. */
    private boolean registerDuplicate() {
        String channelName = channel().name();
        synchronized (DUPLICATE_STATUS) {
            int seen = DUPLICATE_STATUS.getInt(channelName);
            if (seen >= MAX_DUPLICATES) {
                DUPLICATE_STATUS.setv(channelName, 0);
                return false;
            }
            DUPLICATE_STATUS.setv(channelName, seen + 1);
            return true;
        }
    }

    /**
     * Starts the background sender the first time any collector bean is initialised.
     *
     * <p>The queue is static and shared, so a single sender is enough; one thread per
     * collector would multiply the send rate by the number of collectors.
     */
    @Override
    public void afterPropertiesSet() {
        if (!SENDER_STARTED.compareAndSet(false, true)) {
            return;
        }
        Thread sender = new Thread(this::drainQueue, "notice-robot-sender");
        sender.setDaemon(true);
        sender.start();
    }

    /** Blocks on the queue, delivers each notice, then pauses; exits cleanly on interrupt. */
    private void drainQueue() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                Notice notice = NOTICES_QUEUE.take();
                deliver(notice);
                Thread.sleep(SEND_INTERVAL_MILLIS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            } catch (RuntimeException e) {
                // A bad notice must not stop the sender for all later notices.
                Logs.get().warn("Failed to deliver notice", e);
            }
        }
    }

    /** Sends the markdown card to both robots, plus the @-mention text for high-priority notices. */
    private void deliver(Notice notice) {
        String markdown = String.format(MESSAGE_TEMPLATE, notice.getTitle(), notice.getPublishDate(),
                notice.getChannel().getDescription(), notice.getUrl());
        String card = Json.toJson(NutMap.NEW().addv("msgtype", "markdown").addv("markdown",
                NutMap.NEW().addv("content", markdown)));
        post(ROBOT_URL, card);
        post(ROBOT_URL_GX, card);
        if (attention(notice)) {
            post(ROBOT_URL, Json.toJson(NutMap.NEW().addv("msgtype", "text").addv("text",
                    NutMap.NEW().addv("mentioned_mobile_list", ATTENTION_MOBILES)
                            .addv("content", "检测到【资源池、外包】项目,请立即确认!" + notice.getTitle() + ": " + notice.getUrl()))));
        }
    }

    /** Posts a JSON body to a robot; a failure is logged so the remaining posts still happen. */
    private static void post(String url, String json) {
        try {
            Http.post3(url, json, Header.create().asJsonContentType(), ROBOT_TIMEOUT_MILLIS);
        } catch (RuntimeException e) {
            // The URL is deliberately not logged: it contains the webhook key.
            Logs.get().warn("Robot webhook call failed", e);
        }
    }

    /** @return whether {@code text} is non-null and contains at least one of {@code words} */
    private static boolean containsAny(String text, List<String> words) {
        return text != null && words.stream().anyMatch(text::contains);
    }

    /**
     * @param notice the notice to inspect
     * @return {@code true} when title or content contains a high-priority keyword
     */
    public boolean attention(Notice notice) {
        List<String> words = Lang.list(ATTENTION_KEY_WORDS.split("、"));
        return containsAny(notice.getTitle(), words) || containsAny(notice.getContent(), words);
    }

    /**
     * Queues the notice for the background sender when it passes {@link #match(Notice)}.
     *
     * @param notice a newly stored notice
     */
    public void sendMessage(Notice notice) {
        if (match(notice)) {
            NOTICES_QUEUE.add(notice);
        }
    }

    /**
     * Forwarding rule: no blacklisted word in title or content, and at least one keyword in either.
     *
     * @param notice the notice
     * @return whether the notice should be forwarded
     */
    public boolean match(Notice notice) {
        List<String> blocked = Lang.list(BLOCKED_KEY_WORDS.split("、"));
        if (containsAny(notice.getTitle(), blocked) || containsAny(notice.getContent(), blocked)) {
            return false;
        }
        List<String> wanted = Lang.list(KEY_WORDS.split("、"));
        return containsAny(notice.getTitle(), wanted) || containsAny(notice.getContent(), wanted);
    }

    public abstract ScheduledLogRepository scheduledLogRepository();

    public abstract NoticeRepository noticeRepository();

    /**
     * @return a log entry for the current channel and thread
     */
    public ScheduledLog startLog() {
        return ScheduledLog.builder().channel(channel()).threadId(Thread.currentThread().getId()).build();
    }

    /**
     * Stamps the end time and stores the log entry.
     *
     * @param log the entry returned by {@link #startLog()}
     */
    public void stopLog(ScheduledLog log) {
        log.setEnd(LocalDateTime.now());
        scheduledLogRepository().insert(log);
    }

    /**
     * Manual smoke test: sends a test @-mention to the main robot.
     * NOTE(review): this posts to the production group when run.
     */
    public static void main(String[] args) {
        Http.post3(ROBOT_URL,
                Json.toJson(NutMap.NEW().addv("msgtype", "text").addv("text",
                        NutMap.NEW().addv("mentioned_mobile_list", ATTENTION_MOBILES)
                                .addv("content", "本条消息是测试,检测到【资源池、人力外包】项目,请立即确认!"))),
                Header.create().asJsonContentType(), ROBOT_TIMEOUT_MILLIS);
    }
}
// ---- File: src/main/java/tech/riemann/bidding/component/impl/BOCNoticeCollector.java ----
package tech.riemann.bidding.component.impl;

import java.io.IOException;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.nutz.lang.Lang;
import org.nutz.log.Log;
import org.nutz.log.Logs;
import org.springframework.stereotype.Component;

import lombok.RequiredArgsConstructor;
import tech.riemann.bidding.component.NoticeCollector;
import tech.riemann.bidding.entity.Notice;
import tech.riemann.bidding.entity.Notice.Channel;
import tech.riemann.bidding.repository.NoticeRepository;
import tech.riemann.bidding.repository.ScheduledLogRepository;

/**
 * Collector for {@link Channel#BOC}: walks the paged notice list and pushes every entry.
 */
@Component
@RequiredArgsConstructor
public class BOCNoticeCollector extends NoticeCollector {

    private static final Log logger = Logs.get();
    /** Upper bound on list pages read per run. */
    private static final int MAX_PAGES = 20;
    /** Prefix prepended to each relative link after its leading character is dropped. */
    private static final String DETAIL_BASE = "https://www.bankofchina.com/aboutboc/bi6";

    private final NoticeRepository noticeRepository;
    private final ScheduledLogRepository scheduledLogRepository;

    @Override
    public ScheduledLogRepository scheduledLogRepository() {
        return scheduledLogRepository;
    }

    @Override
    public NoticeRepository noticeRepository() {
        return noticeRepository;
    }

    @Override
    public Channel channel() {
        return Channel.BOC;
    }

    /**
     * Reads up to {@link #MAX_PAGES} list pages; stops at the first empty page, fetch error,
     * or when {@link #pushNotices(List)} reports that paging should end.
     */
    @Override
    public void collect() {
        for (int page = 1; page <= MAX_PAGES; page++) {
            // The first page has no suffix, later pages use "_<n>".
            String url = String.format(channel().getUrl(), page == 1 ? "" : "_" + page);
            logger.debugf("获取页面内容,Url:%s", url);
            Elements items;
            try {
                Document document = Jsoup.connect(url).get();
                items = document.select(".news ul li");
            } catch (IOException e) {
                logger.warn("Failed to fetch " + url, e);
                break;
            }
            if (items.isEmpty()) {
                logger.debugf("没有内容,退出:%s", url);
                break;
            }
            List<Notice> notices = Lang.list();
            for (Element item : items) {
                Notice notice = toNotice(item);
                if (notice != null) {
                    notices.add(notice);
                }
            }
            if (!pushNotices(notices)) {
                break;
            }
        }
    }

    /**
     * Builds a notice from one list item.
     *
     * @return the notice, or {@code null} when the item carries no link
     */
    private Notice toNotice(Element item) {
        String link = item.select("a").attr("href");
        if (link.isEmpty()) {
            // No anchor: there is neither a key nor a URL to build, so skip instead of failing.
            return null;
        }
        return Notice.builder()
                .channel(channel())
                .key(link.replace("/", "").replace("_", "").replace(".html", "").replace(".", ""))
                .title(item.select("a").text())
                .content("")
                .url(DETAIL_BASE + link.substring(1))
                // date() locates the yyyy-MM-dd run itself, so no fixed-offset substring is
                // needed (the old substring(2, 12) threw on short text).
                .publishDate(date(item.select("span").text()))
                .build();
    }

    /**
     * Same rule as the base class, except that keywords are looked up in the title only.
     */
    @Override
    public boolean match(Notice notice) {
        List<String> blocked = Lang.list(BLOCKED_KEY_WORDS.split("、"));
        boolean blacklisted = blocked.stream().anyMatch(word -> notice.getTitle().contains(word))
                || blocked.stream().anyMatch(word -> notice.getContent().contains(word));
        if (blacklisted) {
            return false;
        }
        List<String> wanted = Lang.list(KEY_WORDS.split("、"));
        return wanted.stream().anyMatch(word -> notice.getTitle().contains(word));
    }
}
// ---- File: src/main/java/tech/riemann/bidding/component/impl/BOCQNoticeCollector.java ----
package tech.riemann.bidding.component.impl;

import java.io.IOException;
import java.time.LocalDate;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.nutz.lang.Lang;
import org.nutz.log.Log;
import org.nutz.log.Logs;
import org.springframework.stereotype.Component;

import lombok.RequiredArgsConstructor;
import tech.riemann.bidding.component.NoticeCollector;
import tech.riemann.bidding.entity.Notice;
import tech.riemann.bidding.entity.Notice.Channel;
import tech.riemann.bidding.repository.NoticeRepository;
import tech.riemann.bidding.repository.ScheduledLogRepository;

/**
 * Collector for {@link Channel#BOCQ}: walks the paged notice list and pushes every entry.
 */
@Component
@RequiredArgsConstructor
public class BOCQNoticeCollector extends NoticeCollector {

    private static final Log logger = Logs.get();
    /** Upper bound on list pages read per run. */
    private static final int MAX_PAGES = 20;
    /** Site root prepended to each relative link. */
    private static final String SITE_ROOT = "http://www.cqcbank.com.cn";

    private final NoticeRepository noticeRepository;
    private final ScheduledLogRepository scheduledLogRepository;

    @Override
    public ScheduledLogRepository scheduledLogRepository() {
        return scheduledLogRepository;
    }

    @Override
    public NoticeRepository noticeRepository() {
        return noticeRepository;
    }

    @Override
    public Channel channel() {
        return Channel.BOCQ;
    }

    /**
     * Reads up to {@link #MAX_PAGES} list pages; stops at the first empty page, fetch error,
     * or when {@link #pushNotices(List)} reports that paging should end.
     */
    @Override
    public void collect() {
        for (int page = 1; page <= MAX_PAGES; page++) {
            String url = pageUrl(page);
            logger.debugf("获取页面内容,Url:%s", url);
            List<Notice> notices;
            try {
                notices = readPage(url);
            } catch (IOException e) {
                logger.warn("Failed to fetch " + url, e);
                break;
            }
            if (notices == null) {
                logger.debugf("没有内容,退出:%s", url);
                break;
            }
            if (!pushNotices(notices)) {
                break;
            }
        }
    }

    /** List URL for a page: the first page has no suffix, later pages use "_<n>". */
    private static String pageUrl(int page) {
        return String.format(Channel.BOCQ.getUrl(), page == 1 ? "" : "_" + page);
    }

    /**
     * Downloads one list page and converts its items.
     *
     * @return the notices on the page, or {@code null} when the page has no list items
     * @throws IOException when the page cannot be fetched
     */
    private List<Notice> readPage(String url) throws IOException {
        Document document = Jsoup.connect(url).get();
        Elements items = document.select(".dhy_b li");
        if (items.isEmpty()) {
            return null;
        }
        List<Notice> notices = Lang.list();
        for (Element item : items) {
            String link = item.select("a").attr("href");
            if (link.isEmpty()) {
                // An item without a link would produce an empty key; skip it.
                continue;
            }
            LocalDate published = date(item.select("span").text());
            notices.add(Notice.builder()
                    .channel(Channel.BOCQ)
                    .key(link.replace("/", "").replace("_", "").replace(".html", "").replace(".", ""))
                    .title(item.select("a").text())
                    .content("")
                    .url(SITE_ROOT + link)
                    .publishDate(published)
                    .build());
        }
        return notices;
    }

    /**
     * Same rule as the base class, except that keywords are looked up in the title only.
     */
    @Override
    public boolean match(Notice notice) {
        List<String> blocked = Lang.list(BLOCKED_KEY_WORDS.split("、"));
        boolean blacklisted = blocked.stream().anyMatch(word -> notice.getTitle().contains(word))
                || blocked.stream().anyMatch(word -> notice.getContent().contains(word));
        if (blacklisted) {
            return false;
        }
        List<String> wanted = Lang.list(KEY_WORDS.split("、"));
        return wanted.stream().anyMatch(word -> notice.getTitle().contains(word));
    }

    /**
     * Manual harness: prints what {@link #collect()} would parse, without touching the database.
     * It goes through the same parsing code as the scheduled run, so the two cannot drift apart.
     */
    public static void main(String[] args) {
        // Repositories are not used by the parsing path, so none are supplied.
        BOCQNoticeCollector parser = new BOCQNoticeCollector(null, null);
        for (int page = 1; page <= MAX_PAGES; page++) {
            String url = pageUrl(page);
            System.out.println("获取页面内容,Url:" + url);
            try {
                List<Notice> notices = parser.readPage(url);
                if (notices == null) {
                    System.out.println("没有内容,退出:" + url);
                    break;
                }
                for (Notice notice : notices) {
                    System.out.println("notice:\t" + notice.toString());
                }
            } catch (IOException e) {
                System.out.println("IOException:" + e.getMessage());
                break;
            }
        }
    }
}
// ---- File: src/main/java/tech/riemann/bidding/component/impl/BaoSteelNoticeCollector.java ----
package tech.riemann.bidding.component.impl;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.nutz.dao.Cnd;
import org.nutz.http.Header;
import org.nutz.http.Http;
import org.nutz.http.Response;
import org.nutz.lang.Strings;
import org.nutz.log.Logs;
import org.nutz.spring.boot.service.ExtService;

import lombok.RequiredArgsConstructor;
import tech.riemann.bidding.component.NoticeCollector;
import tech.riemann.bidding.entity.Notice;
import tech.riemann.bidding.entity.Notice.Channel;
import tech.riemann.bidding.repository.NoticeRepository;
import tech.riemann.bidding.repository.ScheduledLogRepository;

/**
 * Collector for the Shanghai Baohua International channel.
 *
 * <p>Not registered as a component and marked deprecated, so it is not scheduled.
 */
// @Component
@Deprecated
@RequiredArgsConstructor
public class BaoSteelNoticeCollector extends NoticeCollector {

    private final NoticeRepository noticeRepository;
    private final ScheduledLogRepository scheduledLogRepository;

    @Override
    public NoticeRepository noticeRepository() {
        return noticeRepository;
    }

    @Override
    public ScheduledLogRepository scheduledLogRepository() {
        return scheduledLogRepository;
    }

    @Override
    public Channel channel() {
        return Channel.BAO_STEEL;
    }

    /**
     * Fetches the list page and stores every notice not yet in the database.
     *
     * <p>Each notice occupies two consecutive {@code li} rows: a row with the link, followed
     * by a row of class {@code kws} that carries the date.
     */
    @Override
    public void collect() {
        // Without a cookie the site answers 521. The cookie is hard-coded for now; whether the
        // home page hands it out automatically still needs to be investigated.
        Response response = Http.get(channel().getUrl(),
                Header.create()
                        .set("Host", "baosteelbidding.zbytb.com")
                        .set("Referer", "https://baosteelbidding.zbytb.com/")
                        .set("Cookie",
                                "__jsluid_s=ac68c37cdc617959a40cef00227811e1; Du4_city=132%7Chttps%3A%2F%2Fbaosteelbidding.zbytb.com%2F; __jsl_clearance_s=1712113445.458|0|%2F%2BzxEoTkUCTMNxnyj9Nu2c1jodI%3D; Hm_lvt_47b9a4b804f6b4f81affae66cb8a57e9=1712023024,1712113449; Hm_lpvt_47b9a4b804f6b4f81affae66cb8a57e9=1712113475; Du4_vistor_st=3")
                        .set("User-Agent",
                                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"),
                5000);
        Logs.get().debugf("上海宝华国际渠道采集,网页访问状态码: %d", response.getStatus());
        if (!response.isOK()) {
            return;
        }
        Document document = Jsoup.parse(response.getContent());
        Elements rows = document.select("ul.li_dot li");
        Notice pending = null; // notice whose link row was read but whose date row is still to come
        for (Element row : rows) {
            if (Strings.equals(row.attr("class"), "kws")) {
                // Date row: only meaningful after a link row; never store a placeholder notice.
                if (pending != null) {
                    pending.setPublishDate(date(row.text()));
                    storeIfNew(pending);
                    pending = null;
                }
            } else {
                Element anchor = row.select("a").first();
                if (anchor == null) {
                    // Row without a link: nothing to build a notice from.
                    pending = null;
                    continue;
                }
                String url = anchor.attr("href");
                pending = Notice.builder()
                        .channel(channel())
                        .key(number(url) + "")
                        .title(anchor.text())
                        .content("")
                        .url(url)
                        .build();
            }
        }
    }

    /** Inserts the notice and queues it for sending unless its channel/key pair already exists. */
    private void storeIfNew(Notice notice) {
        if (noticeRepository.count(Cnd.where(Notice::getChannel, ExtService.EQ, notice.getChannel())
                .and(Notice::getKey, ExtService.EQ, notice.getKey())) == 0) {
            noticeRepository.insert(notice);
            sendMessage(notice);
        }
    }
}

// ---- File: src/main/java/tech/riemann/bidding/component/impl/CFCPNNoticeCollector.java ----
package tech.riemann.bidding.component.impl;

import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.temporal.ChronoUnit;
import java.util.List;
import java.util.stream.Collectors;

import org.nutz.http.Http;
import org.nutz.http.Response;
import org.nutz.lang.Lang;
import org.nutz.lang.util.NutMap;
import org.nutz.log.Logs;
import org.springframework.stereotype.Component;

import lombok.RequiredArgsConstructor;
import tech.riemann.bidding.component.NoticeCollector;
import tech.riemann.bidding.entity.Notice;
import tech.riemann.bidding.entity.Notice.Channel;
import tech.riemann.bidding.repository.NoticeRepository;
import tech.riemann.bidding.repository.ScheduledLogRepository;

/**
 * Collector for {@link Channel#CFCPN}: pages through the JSON list endpoint, newest first.
 */
@Component
@RequiredArgsConstructor
public class CFCPNNoticeCollector extends NoticeCollector {

    /** Detail-page URL; the placeholder receives the notice id. */
    private static final String DETAIL_URL = "http://www.cfcpn.com/jcw/sys/index/goUrl?url=modules/sys/login/detail&column=undefined&searchVal=%s";
    /** Pause between page requests. */
    private static final long PAGE_INTERVAL_MILLIS = 5000L;

    private final NoticeRepository noticeRepository;
    private final ScheduledLogRepository scheduledLogRepository;

    @Override
    public ScheduledLogRepository scheduledLogRepository() {
        return scheduledLogRepository;
    }

    @Override
    public NoticeRepository noticeRepository() {
        return noticeRepository;
    }

    @Override
    public Channel channel() {
        return Channel.CFCPN;
    }

    /**
     * Reads pages until one is empty, the request fails, {@link #pushNotices(List)} asks to
     * stop, or a page contains a notice older than two days.
     */
    @Override
    public void collect() {
        for (int page = 1;; page++) {
            Response response = fetchPage(page);
            if (!response.isOK()) {
                Logs.get().debugf("请求发生错误,状态码为:%d", response.getStatus());
                break;
            }
            List<Notice> notices = parseRows(response);
            if (notices.isEmpty() || !pushNotices(notices)) {
                break;
            }
            // Decide before sleeping: no point waiting when there will be no next page.
            if (hasNoticeOlderThan(notices, 2)) {
                break;
            }
            if (!pause()) {
                break;
            }
        }
    }

    /** Requests one page of published, valid notices ordered by publish time, newest first. */
    private static Response fetchPage(int page) {
        return Http.post2(Channel.CFCPN.getUrl(), NutMap.NEW().addv("noticeType", 1).addv("noticeState", 1)
                .addv("pageNo", page).addv("isValid", 1).addv("orderBy", "publish_time desc"), 5000, 5000);
    }

    /** Converts the {@code rows} array of a response; a missing array is treated as empty. */
    private static List<Notice> parseRows(Response response) {
        List<NutMap> rows = Lang.map(response.getContent()).getList("rows", NutMap.class);
        if (rows == null) {
            return List.of();
        }
        return rows.stream().map(CFCPNNoticeCollector::toNotice).collect(Collectors.toList());
    }

    /** Maps one JSON row to a notice. */
    private static Notice toNotice(NutMap row) {
        return Notice.builder().key(row.getString("id")).channel(Channel.CFCPN)
                .title(row.getString("noticeTitle")).content("")
                .url(String.format(DETAIL_URL, row.getString("id")))
                .publishDate(row.getAs("publishTime", LocalDateTime.class).toLocalDate()).build();
    }

    /** @return whether any notice was published more than {@code days} days ago */
    private static boolean hasNoticeOlderThan(List<Notice> notices, int days) {
        LocalDate cutoff = LocalDate.now().minus(days, ChronoUnit.DAYS);
        return notices.stream().anyMatch(item -> item.getPublishDate().isBefore(cutoff));
    }

    /** Waits between pages; returns {@code false} (with the interrupt flag restored) if interrupted. */
    private static boolean pause() {
        try {
            Thread.sleep(PAGE_INTERVAL_MILLIS);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            Logs.get().debug(e);
            return false;
        }
    }

    /**
     * Manual harness: prints the parsed notices page by page without storing them.
     * Uses a five-day cut-off instead of the two days used by {@link #collect()}.
     */
    public static void main(String[] args) {
        for (int page = 1;; page++) {
            Response response = fetchPage(page);
            if (!response.isOK()) {
                // The old call concatenated the status after a literal "%d".
                Logs.get().infof("请求发生错误,状态码为:%d", response.getStatus());
                Logs.get().debugf("请求发生错误,状态码为:%d", response.getStatus());
                break;
            }
            List<Notice> notices = parseRows(response);
            if (notices.isEmpty()) {
                Logs.get().info("records is empty!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
                break;
            }
            if (!pause()) {
                Logs.get().info("Exception happened:" + new InterruptedException().toString());
                break;
            }
            notices.forEach(System.out::println);
            System.out.println("page=" + page + "" + "now():" + LocalDate.now() + "now()-5days:"
                    + LocalDate.now().minus(5, ChronoUnit.DAYS));
            if (hasNoticeOlderThan(notices, 5)) { // older than five days: stop paging
                break;
            }
        }
    }

}
+@Component +@RequiredArgsConstructor +public class CQRCBNoticeCollector extends NoticeCollector { + + private final NoticeRepository noticeRepository; + + private static final Log logger = Logs.get(); + private final ScheduledLogRepository scheduledLogRepository; + + @Override + public ScheduledLogRepository scheduledLogRepository() { + return scheduledLogRepository; + } + + @Override + public NoticeRepository noticeRepository() { + return noticeRepository; + } + + @Override + public Channel channel() { + return Channel.CQRCB; + } + + @Override + public void collect() { + int totalPages = getTotalPages(); + totalPages = totalPages > 20 ? 20 : totalPages; + for (int page = 1; page <= totalPages; page++) { + String urlSuffixStr = page == 1 ? "index" : "index_" + page + ".html"; + String url = String.format(channel().getUrl(), urlSuffixStr); + logger.debugf("获取页面内容,Url:%s", url); + try { + Document document = Jsoup.connect(url).get(); + Elements elements = document.select(".sideCont ul li"); + if (elements.isEmpty()) { + logger.debugf("没有内容,退出:%s", url); + break; + } + List notices = Lang.list(); + for (Element element : elements) { + String link = element.select("a").attr("href"); + Notice notice = Notice.builder() + .channel(channel()) + .key(link.replace("/","").replace(".html", "")) + .title(element.select("a").text().replace("·","")) + .content("") + .url("https://www.cqrcb.com"+link) + .publishDate(date(element.select("span").text().substring(1,11))) + .build(); + notices.add(notice); + } + if (!pushNotices(notices)) { + break; + } + } catch (IOException e) { + logger.debug(e); + break; + } + } + } + + private int getTotalPages() { + String tempUrlStr = channel().getUrl() + "/index.html"; + int totalPages; + try { + Document document = Jsoup.connect(tempUrlStr).get(); + Elements elements = document.select(".pages em"); + String pageRelativeStr = elements.first().select("span").text(); + int lastSlashPos = pageRelativeStr.lastIndexOf("/"); + totalPages = 
Integer.parseInt(pageRelativeStr.substring(lastSlashPos+1)); + } catch (Exception e) { + // TODO: handle exception + logger.debug(e); + totalPages = 0; + } + return totalPages; + } + + @Override + public boolean match(Notice notice) { + List blacks = Lang.list(BLOCKED_KEY_WORDS.split("、")); + List keyWords = Lang.list(KEY_WORDS.split("、")); + if (blacks.stream().noneMatch(key -> notice.getTitle().contains(key)) + && blacks.stream().noneMatch(key -> notice.getContent().contains(key))) { + return keyWords.stream().anyMatch(key -> notice.getTitle().contains(key)); + } + return false; + } + + public static void main(String[] args) { + for (int page = 1; page <= 20; page++) { + String urlSuffixStr = page == 1 ? "index" : "index_" + page + ".html"; + String url = String.format(Channel.CQRCB.getUrl(), urlSuffixStr); + System.out.println("获取页面内容,Url:"+url); + try { + Document document = Jsoup.connect(url).get(); + Elements elements = document.select(".sideCont ul li"); + if (elements.isEmpty()) { + System.out.println("没有内容,退出:"+url); + break; + } + List notices = Lang.list(); + for (Element element : elements) { + String link = element.select("a").attr("href"); + LocalDate pDate = LocalDate.parse(element.select("span").text().substring(1,11)); + Notice notice = Notice.builder() + .channel(Channel.CQRCB) + .key(link.replace("/","").replace(".html", "")) + .title(element.select("a").text().replace("·","")) + .content("") + .url("https://www.cqrcb.com"+link) + .publishDate(pDate) + .build(); + System.out.println("notice:\t"+notice.toString()); + notices.add(notice); + } + } catch (IOException e) { + logger.debug(e); + break; + } + } +// }); + } +} diff --git a/src/main/java/tech/riemann/bidding/component/impl/ChinaCcsscmNoticeCollector.java b/src/main/java/tech/riemann/bidding/component/impl/ChinaCcsscmNoticeCollector.java new file mode 100644 index 0000000..c17968c --- /dev/null +++ b/src/main/java/tech/riemann/bidding/component/impl/ChinaCcsscmNoticeCollector.java @@ -0,0 
+1,82 @@ +package tech.riemann.bidding.component.impl; + +import java.io.IOException; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.nutz.lang.Lang; +import org.nutz.log.Logs; +import org.springframework.stereotype.Component; + +import lombok.RequiredArgsConstructor; +import tech.riemann.bidding.component.NoticeCollector; +import tech.riemann.bidding.entity.Notice; +import tech.riemann.bidding.entity.Notice.Channel; +import tech.riemann.bidding.repository.NoticeRepository; +import tech.riemann.bidding.repository.ScheduledLogRepository; + +/** + * + */ +@Component +@RequiredArgsConstructor +public class ChinaCcsscmNoticeCollector extends NoticeCollector { + + private final NoticeRepository noticeRepository; + + private final ScheduledLogRepository scheduledLogRepository; + + @Override + public ScheduledLogRepository scheduledLogRepository() { + return scheduledLogRepository; + } + + @Override + public NoticeRepository noticeRepository() { + return noticeRepository; + } + + @Override + public Channel channel() { + return Channel.CHINA_CCSSCM; + } + + @Override + public void collect() { + for (int page = 1;; page++) { + String url = String.format(channel().getUrl(), page); + try { + Document document = Jsoup.connect(url).get(); + Elements elements = document.select(".Top5 ul li"); + // 发起 + if (elements.isEmpty()) { + break; + } + List notices = Lang.list(); + for (Element element : elements) { + String link = element.select("a").attr("href"); + Notice notice = Notice.builder() + .channel(channel()) + .key(number(link) + "") + .title(element.select("a").text()) + .content("") + .url(link.startsWith("http") ? 
link : "https://zb.chinaccsscm.cn" + link) + .publishDate(date(element.select(".Right").text())) + .build(); + notices.add(notice); + } + if (!pushNotices(notices)) { + break; + } + } + catch (IOException e) { + Logs.get().debug(e); + break; + } + } + } + +} diff --git a/src/main/java/tech/riemann/bidding/component/impl/EBIDDINGNoticeCollector.java b/src/main/java/tech/riemann/bidding/component/impl/EBIDDINGNoticeCollector.java new file mode 100644 index 0000000..6f03a6c --- /dev/null +++ b/src/main/java/tech/riemann/bidding/component/impl/EBIDDINGNoticeCollector.java @@ -0,0 +1,88 @@ +package tech.riemann.bidding.component.impl; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.nutz.lang.Lang; +import org.nutz.log.Log; +import org.nutz.log.Logs; +import org.springframework.stereotype.Component; + +import lombok.RequiredArgsConstructor; +import tech.riemann.bidding.component.NoticeCollector; +import tech.riemann.bidding.entity.Notice; +import tech.riemann.bidding.entity.Notice.Channel; +import tech.riemann.bidding.repository.NoticeRepository; +import tech.riemann.bidding.repository.ScheduledLogRepository; + +/** + * + */ +@Component +@RequiredArgsConstructor +public class EBIDDINGNoticeCollector extends NoticeCollector { + + private final NoticeRepository noticeRepository; + + private static final Log logger = Logs.get(); + private final ScheduledLogRepository scheduledLogRepository; + + @Override + public ScheduledLogRepository scheduledLogRepository() { + return scheduledLogRepository; + } + + @Override + public NoticeRepository noticeRepository() { + return noticeRepository; + } + + @Override + public Channel channel() { + return Channel.E_BIDDING; + } + + @Override + public void collect() { + Arrays.stream(KEY_WORDS.split("、")).forEach(keyword -> { + logger.debugf("爬取关键字,Url:%s", keyword); + 
for (int page = 1; page <= 20; page++) { + String url = String.format(channel().getUrl(), keyword, page); + logger.debugf("获取页面内容,Url:%s", url); + try { + Document document = Jsoup.connect(url).get(); + Elements elements = document.select(".newslist>li"); + if (elements.isEmpty()) { + logger.debugf("没有内容,退出:%s", url); + break; + } + List notices = Lang.list(); + for (Element element : elements) { + String link = element.select("a").attr("href"); + Notice notice = Notice.builder() + .channel(channel()) + .key(number(link.substring(link.lastIndexOf("/"))) + "") + .title(element.select("a").attr("title")) + .content("") + .url(link) + .publishDate(date(element.select(".newsDate div").text())) + .build(); + notices.add(notice); + } + if (!pushNotices(notices)) { + break; + } + } + catch (IOException e) { + logger.debug(e); + break; + } + } + }); + } +} diff --git a/src/main/java/tech/riemann/bidding/component/impl/GCZBNoticeCollector.java b/src/main/java/tech/riemann/bidding/component/impl/GCZBNoticeCollector.java new file mode 100644 index 0000000..6a34853 --- /dev/null +++ b/src/main/java/tech/riemann/bidding/component/impl/GCZBNoticeCollector.java @@ -0,0 +1,99 @@ +package tech.riemann.bidding.component.impl; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.nutz.lang.Lang; +import org.nutz.log.Log; +import org.nutz.log.Logs; +import org.springframework.stereotype.Component; + +import lombok.RequiredArgsConstructor; +import tech.riemann.bidding.component.NoticeCollector; +import tech.riemann.bidding.entity.Notice; +import tech.riemann.bidding.entity.Notice.Channel; +import tech.riemann.bidding.repository.NoticeRepository; +import tech.riemann.bidding.repository.ScheduledLogRepository; + +/** + * + */ +@Component +@RequiredArgsConstructor +public class GCZBNoticeCollector extends NoticeCollector { 
+ + private final NoticeRepository noticeRepository; + + private static final Log logger = Logs.get(); + private final ScheduledLogRepository scheduledLogRepository; + + @Override + public ScheduledLogRepository scheduledLogRepository() { + return scheduledLogRepository; + } + + @Override + public NoticeRepository noticeRepository() { + return noticeRepository; + } + + @Override + public Channel channel() { + return Channel.GCZB; + } + + @Override + public void collect() { + Arrays.stream(KEY_WORDS.split("、")).forEach(keyword -> { + logger.debugf("爬取关键字,Url:%s", keyword); + for (int page = 1; page <= 20; page++) { + String url = String.format(channel().getUrl(), page, keyword); + logger.debugf("获取页面内容,Url:%s", url); + try { + Document document = Jsoup.connect(url).get(); + Elements elements = document.select(".lists_center ul li"); + if (elements.isEmpty()) { + logger.debugf("没有内容,退出:%s", url); + break; + } + List notices = Lang.list(); + for (Element element : elements) { + String link = element.select("a").attr("href"); + Notice notice = Notice.builder() + .channel(channel()) + .key(number(link) + "") + .title(element.select("a").text()) + .content("") + .url(link) + .publishDate(date(element.select("b").text())) + .build(); + notices.add(notice); + } + if (!pushNotices(notices)) { + break; + } + } + catch (IOException e) { + logger.debug(e); + break; + } + } + }); + } + + @Override + public boolean match(Notice notice) { + List blacks = Lang.list(BLOCKED_KEY_WORDS.split("、")); + List keyWords = Lang.list(KEY_WORDS.split("、")); + List industrys = Lang.list(INDUSTRY_KEY_WORDS.split("、")); + if (blacks.stream().noneMatch(key -> notice.getTitle().contains(key)) && blacks.stream().noneMatch(key -> notice.getContent().contains(key))) { + return keyWords.stream().anyMatch(key -> notice.getTitle().contains(key)) && industrys.stream().anyMatch(key -> notice.getTitle().contains(key)); + } + return false; + } +} diff --git 
a/src/main/java/tech/riemann/bidding/component/impl/GXZBNoticeCollector.java b/src/main/java/tech/riemann/bidding/component/impl/GXZBNoticeCollector.java new file mode 100644 index 0000000..0976062 --- /dev/null +++ b/src/main/java/tech/riemann/bidding/component/impl/GXZBNoticeCollector.java @@ -0,0 +1,87 @@ +package tech.riemann.bidding.component.impl; + +import java.io.IOException; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.nutz.lang.Lang; +import org.nutz.log.Log; +import org.nutz.log.Logs; +import org.springframework.stereotype.Component; + +import lombok.RequiredArgsConstructor; +import tech.riemann.bidding.component.NoticeCollector; +import tech.riemann.bidding.entity.Notice; +import tech.riemann.bidding.entity.Notice.Channel; +import tech.riemann.bidding.repository.NoticeRepository; +import tech.riemann.bidding.repository.ScheduledLogRepository; + +/** + * + */ +@Component +@RequiredArgsConstructor +public class GXZBNoticeCollector extends NoticeCollector { + + private final NoticeRepository noticeRepository; + private final ScheduledLogRepository scheduledLogRepository; + private static final Log logger = Logs.get(); + @Override + public ScheduledLogRepository scheduledLogRepository() { + return scheduledLogRepository; + } + + @Override + public NoticeRepository noticeRepository() { + return noticeRepository; + } + + @Override + public Channel channel() { + return Channel.GXZB; + } + + @Override + public void collect() { + for (int page = 1; page <= 20; page++) { + String url = String.format(channel().getUrl(), page); + logger.debugf("获取页面内容,Url:%s", url); + try { + Document document = Jsoup.connect(url).get(); + Elements elements = document.select(".newslist>li"); + if (elements.isEmpty()) { + logger.debugf("没有内容,退出:%s", url); + break; + } + List notices = Lang.list(); + for (Element element : elements) { + String link = 
element.select("a").attr("href"); + Notice notice = Notice.builder() + .channel(channel()) + .key(number(link.substring(link.lastIndexOf("/"))) + "") + .title(element.select("a").attr("title")) + .content("") + .url(link) + .publishDate(date(element.select(".newsDate div").text())) + .build(); + notices.add(notice); + } + if (!pushNotices(notices)) { + break; + } + } + catch (IOException e) { + logger.debug(e); + break; + } + } + } + public static void main(String[] args) { + String link = "https://ebid.gxzb.com.cn/biddingBulletin/2024-04-25/46236.html"; + String number = NoticeCollector.number(link.substring(link.lastIndexOf("/")))+""; + System.out.println(number); + } +} diff --git a/src/main/java/tech/riemann/bidding/component/impl/PICCECNoticeCollector.java b/src/main/java/tech/riemann/bidding/component/impl/PICCECNoticeCollector.java new file mode 100644 index 0000000..496ba7f --- /dev/null +++ b/src/main/java/tech/riemann/bidding/component/impl/PICCECNoticeCollector.java @@ -0,0 +1,111 @@ +package tech.riemann.bidding.component.impl; + +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.time.temporal.ChronoUnit; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.stream.Collectors; + +import org.nutz.http.Header; +import org.nutz.http.Http; +import org.nutz.http.Response; +import org.nutz.json.Json; +import org.nutz.lang.Lang; +import org.nutz.lang.util.NutMap; +import org.nutz.log.Log; +import org.nutz.log.Logs; +import org.springframework.stereotype.Component; + +import lombok.RequiredArgsConstructor; +import tech.riemann.bidding.component.NoticeCollector; +import tech.riemann.bidding.entity.Notice; +import tech.riemann.bidding.entity.Notice.Channel; +import tech.riemann.bidding.repository.NoticeRepository; +import tech.riemann.bidding.repository.ScheduledLogRepository; + +/** + * + */ +@Component +@RequiredArgsConstructor +public class PICCECNoticeCollector extends NoticeCollector 
{ + + private final NoticeRepository noticeRepository; + private final ScheduledLogRepository scheduledLogRepository; + private static final Log logger = Logs.get(); + @Override + public ScheduledLogRepository scheduledLogRepository() { + return scheduledLogRepository; + } + + @Override + public NoticeRepository noticeRepository() { + return noticeRepository; + } + + @Override + public Channel channel() { + return Channel.PICCEC; + } + + @Override + public void collect() { + //供应商征集 + //"siteId": "725", "categoryId": "211", "city": "", "county": "", "purchaseMode": "" + // + for (int page = 1;; page++) { + Response response = Http.post3(channel().getUrl(), + Json.toJson(NutMap.NEW() + .addv("dto", NutMap.NEW().addv("siteId", "725") + .addv("categoryId", "211,213,214,215,216,217") + .addv("city", "") + .addv("county", "") + .addv("purchaseMode", "") + ) + .addv("pageNo", page) + .addv("pageSize", 10)), + Header.create().asJsonContentType(), + 5000); + if (response.isOK()) { + List records = Lang.map(response.getContent()).getAs("res",NutMap.class).getList("rows", NutMap.class); + if (records.isEmpty()) { + break; + } + List notices = records.stream() + .map(item -> { + String content = ""; + String url=item.getString("url"); + logger.debugf("获取详情页面,Url:%s", url); + return Notice.builder() + .key(Arrays.stream(url.split("/")).map(NoticeCollector::number).filter(urlitem->urlitem != 0).map(urlitem->urlitem+"").collect(Collectors.joining("-"))) + .channel(channel()) + .title(item.getString("title")) + .content(content) + .url("https://ec.picc.com/cms/default/webfile"+url) + .publishDate( LocalDate.parse(item.getString("publishDate"), DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'+0800'", Locale.CHINESE))) + .build(); + }) + .collect(Collectors.toList()); + + if (!pushNotices(notices)) { + break; + } + try { + Thread.sleep(5000); + } + catch (Exception e) { + Logs.get().debug(e); + break; + } + if (notices.stream().anyMatch(item -> 
item.getPublishDate().isBefore(LocalDate.now().minus(2, ChronoUnit.DAYS)))) {// 早于两天的数据了,不再翻页 + break; + } + } else { + Logs.get().debugf("请求发生错误,状态码为:%d", response.getStatus()); + break; + } + } + } +} diff --git a/src/main/java/tech/riemann/bidding/component/impl/SWSCNoticeCollector.java b/src/main/java/tech/riemann/bidding/component/impl/SWSCNoticeCollector.java new file mode 100644 index 0000000..4033df3 --- /dev/null +++ b/src/main/java/tech/riemann/bidding/component/impl/SWSCNoticeCollector.java @@ -0,0 +1,136 @@ +package tech.riemann.bidding.component.impl; + +import java.io.IOException; +import java.time.LocalDate; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.nutz.lang.Lang; +import org.nutz.log.Log; +import org.nutz.log.Logs; +import org.springframework.stereotype.Component; + +import lombok.RequiredArgsConstructor; +import tech.riemann.bidding.component.NoticeCollector; +import tech.riemann.bidding.entity.Notice; +import tech.riemann.bidding.entity.Notice.Channel; +import tech.riemann.bidding.repository.NoticeRepository; +import tech.riemann.bidding.repository.ScheduledLogRepository; + +/** + * + */ +@Component +@RequiredArgsConstructor +public class SWSCNoticeCollector extends NoticeCollector { + + private final NoticeRepository noticeRepository; + + private static final Log logger = Logs.get(); + private final ScheduledLogRepository scheduledLogRepository; + + @Override + public ScheduledLogRepository scheduledLogRepository() { + return scheduledLogRepository; + } + + @Override + public NoticeRepository noticeRepository() { + return noticeRepository; + } + + @Override + public Channel channel() { + return Channel.SWSC; + } + + @Override + public void collect() { + for (int page = 1; page <= 20; page++) { + String urlSuffixStr = page == 1 ? 
"" : "index_" + page + ".html"; + String url = String.format(channel().getUrl(), urlSuffixStr); + logger.debugf("获取页面内容,Url:%s", url); + try { + Document document = Jsoup.connect(url).get(); + Elements elements = document.select(".m-list ul li"); + if (elements.isEmpty()) { + logger.debugf("没有内容,退出:%s", url); + break; + } + List notices = Lang.list(); + for (Element element : elements) { + String link = element.select("a").attr("href"); + Notice notice = Notice.builder() + .channel(channel()) + .key(link.replace("/","").replace(".html", "")) + .title(element.select("a").attr("title")) + .content("") + .url("https://www.swsc.com.cn"+link) + .publishDate(date(element.select(".li-right").text())) + .build(); + notices.add(notice); + } + if (!pushNotices(notices)) { + break; + } + } catch (IOException e) { + logger.debug(e); + break; + } + } + } + + @Override + public boolean match(Notice notice) { + List blacks = Lang.list(BLOCKED_KEY_WORDS.split("、")); + List keyWords = Lang.list(KEY_WORDS.split("、")); + if (blacks.stream().noneMatch(key -> notice.getTitle().contains(key)) + && blacks.stream().noneMatch(key -> notice.getContent().contains(key))) { + return keyWords.stream().anyMatch(key -> notice.getTitle().contains(key)); + } + return false; + } + + public static void main(String[] args) { + for (int page = 1; page <= 20; page++) { + String urlSuffixStr = page == 1 ? 
"" : "index_" + page + ".html"; + String url = String.format(Channel.SWSC.getUrl(), urlSuffixStr); + System.out.println("获取页面内容,Url:"+url); + try { + Document document = Jsoup.connect(url).get(); + Elements elements = document.select(".m-list ul li"); + if (elements.isEmpty()) { + System.out.println("没有内容,退出:"+url); + break; + } + List notices = Lang.list(); + for (Element element : elements) { + System.out.println("element:" + element); + String link = element.select("a").attr("href"); + LocalDate pDate = LocalDate.parse(element.select(".li-right").text()); + Notice notice = Notice.builder() + .channel(Channel.SWSC) + .key(link.replace("/","").replace(".html", "")) + .title(element.select("a").attr("title")) + .content(element.select("a").text()) + .url("https://www.swsc.com.cn"+link) + .publishDate(pDate) + .build(); + System.out.println("notice:\t"+notice.toString()); + notices.add(notice); + } +// if (!pushNotices(notices)) { +// break; +// } + } catch (IOException e) { +// logger.debug(e); + System.out.println("IOException:" + e.getMessage()); + break; + } + }; +// }); + } +} diff --git a/src/main/java/tech/riemann/bidding/component/impl/SinoChemitcNoticeCollector.java b/src/main/java/tech/riemann/bidding/component/impl/SinoChemitcNoticeCollector.java new file mode 100644 index 0000000..64ef695 --- /dev/null +++ b/src/main/java/tech/riemann/bidding/component/impl/SinoChemitcNoticeCollector.java @@ -0,0 +1,114 @@ +package tech.riemann.bidding.component.impl; + +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.time.temporal.ChronoUnit; +import java.util.List; +import java.util.stream.Collectors; + +import org.nutz.http.Http; +import org.nutz.http.Response; +import org.nutz.lang.Lang; +import org.nutz.lang.util.NutMap; +import org.springframework.stereotype.Component; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import tech.riemann.bidding.component.NoticeCollector; 
+import tech.riemann.bidding.entity.Notice; +import tech.riemann.bidding.entity.Notice.Channel; +import tech.riemann.bidding.repository.NoticeRepository; +import tech.riemann.bidding.repository.ScheduledLogRepository; + +/** + * 中化商务采集器 + */ +@Component +@RequiredArgsConstructor +public class SinoChemitcNoticeCollector extends NoticeCollector { + + private final NoticeRepository noticeRepository; + private final ScheduledLogRepository scheduledLogRepository; + + @Override + public ScheduledLogRepository scheduledLogRepository() { + return scheduledLogRepository; + } + + @Override + public NoticeRepository noticeRepository() { + return noticeRepository; + } + + @Getter + @AllArgsConstructor + public enum Type { + /** + * + */ + ZB_YS_BG(1, "招标/预审/变更"), + /** + * + */ + FZB(3, "非招标"), + /** + * + */ + PB_ZB_JG(2, "评标/中标结果"); + + int value; + String description; + } + + protected String url(int page, String start, String end) { + return String.format(channel().getUrl(), page, 50, Type.ZB_YS_BG.getValue(), start, end); + } + + @Override + public Channel channel() { + return Channel.SINOCHEMITC; + } + + @Override + public void collect() { + for (int page = 1;; page++) { + Response response = Http.get(url(page, LocalDate.now().minus(1, ChronoUnit.DAYS).format(DateTimeFormatter.ofPattern("yyyy-MM-dd")), "")); + if (response.isOK()) { + String content = response.getContent(); + List records = Lang.map(content).getAs("data", NutMap.class).getList("records", NutMap.class); + if (records.isEmpty()) { + break; + } + List notices = records.stream() + .map(d -> Notice.builder() + .channel(channel()) + .key(d.getString("id")) + .title(d.getString("title")) + .content(d.getString("content")) + .url(String + .format("https://d.sinochemitc.com/#/zcnotice/detale/xq?id=%s", d.getString("id"))) + .publishDate(d.getAs("publishDate", LocalDate.class)) + .build()) + .collect(Collectors.toList()); + + if (!pushNotices(notices)) { + break; + } + } else { + break; + } + } + } + + @Override + 
public boolean match(Notice notice) { + List blacks = Lang.list(BLOCKED_KEY_WORDS.split("、")); + List keyWords = Lang.list(KEY_WORDS.split("、")); + if (blacks.stream().noneMatch(key -> notice.getTitle().contains(key)) && blacks.stream().noneMatch(key -> notice.getContent().contains(key))) { + return keyWords.stream().anyMatch(key -> notice.getTitle().contains(key)); + } + return false; + } + +} diff --git a/src/main/java/tech/riemann/bidding/entity/IdBaseEntity.java b/src/main/java/tech/riemann/bidding/entity/IdBaseEntity.java new file mode 100644 index 0000000..84e6eff --- /dev/null +++ b/src/main/java/tech/riemann/bidding/entity/IdBaseEntity.java @@ -0,0 +1,49 @@ +package tech.riemann.bidding.entity; + +import java.time.LocalDateTime; + +import org.nutz.dao.entity.annotation.Column; +import org.nutz.dao.entity.annotation.Comment; +import org.nutz.spring.boot.service.entity.IdEntity; + +import com.fasterxml.jackson.annotation.JsonFormat; +import com.fasterxml.jackson.annotation.JsonFormat.Shape; + +import io.swagger.v3.oas.annotations.media.Schema; +import io.swagger.v3.oas.annotations.media.Schema.RequiredMode; +import lombok.AllArgsConstructor; +import lombok.Builder.Default; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.experimental.Accessors; +import lombok.experimental.SuperBuilder; + +@Data +@SuperBuilder +@NoArgsConstructor +@AllArgsConstructor +@Accessors(chain = true) +@EqualsAndHashCode(callSuper = true) +public class IdBaseEntity extends IdEntity { + + /** + * + */ + private static final long serialVersionUID = 1L; + + @Schema(description = "创建时间", requiredMode = RequiredMode.NOT_REQUIRED) + @Column("created_time") + @Comment("创建时间") + @Default + @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss", locale = "GMT+8", shape = Shape.STRING) + protected LocalDateTime createdTime = LocalDateTime.now(); + + @Schema(description = "最后更新时间", requiredMode = RequiredMode.NOT_REQUIRED) + @Column("updated_time") + 
@Comment("最后更新时间") + @Default + @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss", locale = "GMT+8", shape = Shape.STRING) + protected LocalDateTime updatedTime = LocalDateTime.now(); + +} diff --git a/src/main/java/tech/riemann/bidding/entity/Notice.java b/src/main/java/tech/riemann/bidding/entity/Notice.java new file mode 100644 index 0000000..6d1c5e7 --- /dev/null +++ b/src/main/java/tech/riemann/bidding/entity/Notice.java @@ -0,0 +1,190 @@ +package tech.riemann.bidding.entity; + +import java.time.LocalDate; + +import org.nutz.dao.entity.annotation.ColDefine; +import org.nutz.dao.entity.annotation.ColType; +import org.nutz.dao.entity.annotation.Column; +import org.nutz.dao.entity.annotation.Comment; +import org.nutz.dao.entity.annotation.Table; +import org.nutz.json.JsonField; +import org.nutz.lang.util.NutMap; + +import com.fasterxml.jackson.annotation.JsonGetter; + +import io.swagger.v3.oas.annotations.media.Schema; +import io.swagger.v3.oas.annotations.media.Schema.RequiredMode; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.experimental.Accessors; +import lombok.experimental.FieldNameConstants; +import lombok.experimental.SuperBuilder; + +@Data +@SuperBuilder +@NoArgsConstructor +@AllArgsConstructor +@FieldNameConstants +@EqualsAndHashCode(callSuper = true) +@Accessors(chain = true) +@Table("t_notice") +@Comment("招标公告") +@Schema(name = "Notice", description = "招标公告") +public class Notice extends IdBaseEntity { + + private static final long serialVersionUID = 1L; + + @Schema(description = "公告唯一标识(渠道内唯一)", requiredMode = RequiredMode.REQUIRED) + @Column("n_key") + @Comment("公告唯一标识(渠道内唯一)") + @ColDefine(notNull = true, width = 100) + String key; + + @Schema(description = "公告标题", requiredMode = RequiredMode.REQUIRED) + @Column("n_title") + @Comment("公告标题") + @ColDefine(notNull = true, width = 200) + String title; + + @Schema(description = "公告链接", 
requiredMode = RequiredMode.REQUIRED) + @Column("n_url") + @Comment("公告链接") + @ColDefine(notNull = true, width = 500) + String url; + + @Schema(description = "公告渠道", requiredMode = RequiredMode.REQUIRED) + @Column("n_channel") + @Comment("公告渠道") + @ColDefine(notNull = true, width = 50) + Channel channel; + + @Schema(description = "公告内容", requiredMode = RequiredMode.AUTO) + @Column("n_content") + @Comment("公告内容") + @ColDefine(notNull = false, type = ColType.TEXT) + String content; + + @Schema(description = "招标金额", requiredMode = RequiredMode.AUTO) + @Column("n_amount") + @Comment("招标金额") + @ColDefine(notNull = false) + double amount; + + @Schema(description = "发布时间", requiredMode = RequiredMode.AUTO) + @Column("n_publish_date") + @Comment("发布时间") + @ColDefine(notNull = false) + LocalDate publishDate; + + @Getter + @AllArgsConstructor + public enum Channel { + /** + * + */ + SINOCHEMITC("sinochemitc", "中化商务", + "https://d.sinochemitc.com/api/management/bidding/hy-bulletin-list?current=%d&size=%d&bulletinType=%d&bidType=&keyword=&startTime=%s&endTime=%s"), + /** + * + */ + CHINA_CCSSCM( + "chinaccsscm", "中通服总部", + "https://zb.chinaccsscm.cn/zbgg/index_%d.jhtml"), + /** + * + */ + CFCPN( + "cfcpn", "金采网", + "http://www.cfcpn.com/jcw/noticeinfo/noticeInfo/dataNoticeList"), + /** + * 1天内,北京、上海、重庆,只匹配标题 + */ + GCZB( + "gczb", "招标与采购网", + "https://www.gc-zb.com/search/index.html?page=%d&keyword=%s&h_lx=&h_province=19,43,47&vague=0&date=1&search_field=1"), + /** + * + */ + BOC( + "boc", "中国银行采购公告", + "https://www.bankofchina.com/aboutboc/bi6/index%s.html"), + /** + * + */ + PICCEC( + "piccec", "人保E采", + "https://ec.picc.com/cms/api/dynamicData/queryContentPage"), + /** + * + */ + GXZB( + "gxzb", "国信招标", + "https://ebid.gxzb.com.cn/cms/category/bulletinList.html?searchDate=1999-04-25&dates=300&word=&categoryId=88&exactSearch=&industryName=&status=&tabName=招标投标&page=%d"), + /** + * + */ + BAO_STEEL( + "baosteel", "上海宝华国际", + "https://baosteelbidding.zbytb.com/fuwu/"), + /** + 
* + */ + BOCQ( + "bocq", "重庆银行采购供应商征集公告", + "http://www.cqcbank.com.cn/cn/jrch/cgxx/hjjkh/ddfa/index%s.html"), + /** + * + */ + SWSC( + "swsc", "西南证券", + "https://www.swsc.com.cn/html/goSwsc/cgxxgg/cgxmgs/"), + /** + * + */ + CQRCB( + "cqrcb", "重庆农村商业银行", + "https://www.cqrcb.com/cqrcb/aboutus/cgxx"), + /** + * + */ + CEBPUBSERVICE( + "cebpubservice", "中国招标投标公共服务平台", + "http://www.cebpubservice.com/"), + /** + * + */ + E_BIDDING( + "ebidding", "国信e采", + "https://www.e-bidding.org/cms/category/bulletinList.html?searchDate=1999-06-21&dates=300&word=%s&categoryId=88&exactSearch=&industryName=&status=&tabName=招标投标&page=%d"), + + /** + * + */ + GTJA( + "gtja", "国泰君安", + "https://www.gtja.com/content/info-open/supplier/purchase-info.html?year=&keyword=%s"); + + String code; + String description; + String url; + } + + @JsonGetter + @JsonField + public NutMap getChannelInfo() { + return channel == null ? null + : NutMap.NEW() + .addv("name", channel.name()) + .addv("code", channel.getCode()) + .addv("description", + channel.getDescription()); + } + + public void setChannelInfo(NutMap typeInfo) { + // do nothing + } + +} diff --git a/src/main/java/tech/riemann/bidding/entity/ScheduledLog.java b/src/main/java/tech/riemann/bidding/entity/ScheduledLog.java new file mode 100644 index 0000000..ba83847 --- /dev/null +++ b/src/main/java/tech/riemann/bidding/entity/ScheduledLog.java @@ -0,0 +1,62 @@ +package tech.riemann.bidding.entity; + +import java.time.LocalDateTime; + +import org.nutz.dao.entity.annotation.ColDefine; +import org.nutz.dao.entity.annotation.Column; +import org.nutz.dao.entity.annotation.Comment; +import org.nutz.dao.entity.annotation.Table; + +import io.swagger.v3.oas.annotations.media.Schema; +import io.swagger.v3.oas.annotations.media.Schema.RequiredMode; +import lombok.AllArgsConstructor; +import lombok.Builder.Default; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.experimental.Accessors; +import 
lombok.experimental.FieldNameConstants; +import lombok.experimental.SuperBuilder; +import tech.riemann.bidding.entity.Notice.Channel; + +/** + * + */ +@Data +@SuperBuilder +@NoArgsConstructor +@AllArgsConstructor +@FieldNameConstants +@EqualsAndHashCode(callSuper = true) +@Accessors(chain = true) +@Table("t_scheduled_log") +@Comment("调度日志") +@Schema(name = "ScheduledLog", description = "调度日志") +public class ScheduledLog extends IdBaseEntity { + + private static final long serialVersionUID = 1L; + + @Schema(description = "公告渠道", requiredMode = RequiredMode.REQUIRED) + @Column("l_channel") + @Comment("公告渠道") + @ColDefine(notNull = true, width = 50) + Channel channel; + + @Schema(description = "开始时间", requiredMode = RequiredMode.REQUIRED) + @Column("l_start") + @Comment("开始时间") + @Default + LocalDateTime start = LocalDateTime.now(); + + @Schema(description = "结束时间", requiredMode = RequiredMode.REQUIRED) + @Column("l_end") + @Comment("结束时间") + @Default + LocalDateTime end = LocalDateTime.now(); + + @Schema(description = "线程id", requiredMode = RequiredMode.REQUIRED) + @Column("l_thread_id") + @Comment("线程id") + long threadId; + +} diff --git a/src/main/java/tech/riemann/bidding/repository/NoticeRepository.java b/src/main/java/tech/riemann/bidding/repository/NoticeRepository.java new file mode 100644 index 0000000..b492d59 --- /dev/null +++ b/src/main/java/tech/riemann/bidding/repository/NoticeRepository.java @@ -0,0 +1,21 @@ +package tech.riemann.bidding.repository; + +import org.nutz.dao.Dao; +import org.nutz.spring.boot.service.interfaces.IdEntityService; +import org.springframework.stereotype.Repository; + +import lombok.RequiredArgsConstructor; +import tech.riemann.bidding.entity.Notice; + +@Repository +@RequiredArgsConstructor +public class NoticeRepository implements IdEntityService { + + private final Dao dao; + + @Override + public Dao dao() { + return dao; + } + +} diff --git a/src/main/java/tech/riemann/bidding/repository/ScheduledLogRepository.java 
b/src/main/java/tech/riemann/bidding/repository/ScheduledLogRepository.java new file mode 100644 index 0000000..bd3edac --- /dev/null +++ b/src/main/java/tech/riemann/bidding/repository/ScheduledLogRepository.java @@ -0,0 +1,21 @@ +package tech.riemann.bidding.repository; + +import org.nutz.dao.Dao; +import org.nutz.spring.boot.service.interfaces.IdEntityService; +import org.springframework.stereotype.Repository; + +import lombok.RequiredArgsConstructor; +import tech.riemann.bidding.entity.ScheduledLog; + +@Repository +@RequiredArgsConstructor +public class ScheduledLogRepository implements IdEntityService { + + private final Dao dao; + + @Override + public Dao dao() { + return dao; + } + +} diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties new file mode 100644 index 0000000..d12779c --- /dev/null +++ b/src/main/resources/application.properties @@ -0,0 +1 @@ +spring.application.name=bidding \ No newline at end of file diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml new file mode 100644 index 0000000..6b49c96 --- /dev/null +++ b/src/main/resources/application.yml @@ -0,0 +1,62 @@ +spring: + application: + name: bidding + jackson: + date-format: yyyy-MM-dd HH:mm:ss + datasource: + type: com.alibaba.druid.pool.DruidDataSource + driver-class-name: com.mysql.cj.jdbc.Driver + url: jdbc:mysql://localhost:3306/bidding + username: root + password: 123456 + druid: + db-type: mysql + filters: stat,wall,log4j2 + initial-size: 10 + min-idle: 1 + max-active: 50 + max-wait: 60000 + time-between-eviction-runs-millis: 60000 + min-evictable-idle-time-millis: 300000 + validation-query: SELECT 'ezalor' + test-while-idle: true + test-on-borrow: true + test-on-return: false + pool-prepared-statements: true + max-pool-prepared-statement-per-connection-size: 20 + web-stat-filter: + enabled: true + url-pattern: /* + exclusions: /druid/*,*.js,*.gif,*.jpg,*.png,*.css,*.ico + stat-view-servlet: + enabled: 
true + url-pattern: /druid/* + reset-enable: true +nutz: + dao: + runtime: + basepackage: + - tech.riemann.bidding.entity + - BOOT-INF.classes.tech.riemann.bidding.entity +# - tech.riemann.bidding.component + check-index: true + create: true + delete-column: false + migration: true + sql-template: + enable: true + type: beetl + sql-manager: + paths: + - sqls/mysql + - BOOT-INF/classes/sqls/mysql +logging: + file: + name: ${user.home}/logs/${spring.application.name}.log + path: ${user.home}/logs + level: + "[org.nutz]": debug + "[tech.riemann]": debug + "[org.apache.logging]": off + springfox: off + "[io.swagger]": off \ No newline at end of file