业务逻辑修改以及完善

This commit is contained in:
2026-01-10 00:58:29 +08:00
parent 009d839dce
commit 6edf7a4958
6 changed files with 63 additions and 84 deletions

View File

@@ -229,19 +229,6 @@
<artifactId>openhtmltopdf-pdfbox</artifactId>
</dependency>
<!-- 可选:支持更复杂 CSS -->
<dependency>
<groupId>com.openhtmltopdf</groupId>
<artifactId>openhtmltopdf-slf4j</artifactId>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
</dependency>
</dependencies>
</project>

View File

@@ -13,10 +13,6 @@ import java.io.IOException;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
* OkHttp第三方库读取网页HTML内容
*/
@@ -125,33 +121,4 @@ public class ReadHtmlByOkHttp {
return noMultipleSpace.trim();
}
/**
 * Manual smoke check: loads one hard-coded URL with an HtmlUnit {@code WebClient},
 * gives background JavaScript a bounded amount of time to run, and prints the
 * page's text content to standard output.
 *
 * <p>Any exception is caught here; its message and stack trace are printed
 * instead of being propagated.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) {
    // Fixed target page. The original author's note says it is rendered by Vue,
    // which is why JavaScript is switched on below.
    final String targetUrl = "https://vetti.hotake.cn/#/jobs/job/detail?jobId=126";
    // Value passed to setTimeout (presumably milliseconds - confirm against HtmlUnit docs).
    final int timeoutValue = 15000;
    // Value passed to waitForBackgroundJavaScript (presumably milliseconds - confirm).
    final long backgroundJsWait = 5000;

    // try-with-resources closes the client when the block exits.
    try (WebClient browser = new WebClient()) {
        // Step 1: configure the client.
        browser.getOptions().setTimeout(timeoutValue);
        // Styling is not needed for text extraction, so CSS handling is turned off.
        browser.getOptions().setCssEnabled(false);
        browser.getOptions().setJavaScriptEnabled(true);
        // Do not let script errors on the page abort the load.
        browser.getOptions().setThrowExceptionOnScriptError(false);

        // Step 2: fetch the page, then wait for background JavaScript to run.
        HtmlPage renderedPage = browser.getPage(targetUrl);
        browser.waitForBackgroundJavaScript(backgroundJsWait);

        // Step 3: dump the text content of the page.
        String extractedText = renderedPage.getTextContent();
        System.out.println("=== HTMLUnit 提取的页面文本 ===");
        System.out.println(extractedText);
    } catch (Exception failure) {
        String report = "提取失败:" + failure.getMessage();
        System.out.println(report);
        failure.printStackTrace();
    }
}
}