简历读取基础逻辑添加以及用户语音配置信息字段添加
This commit is contained in:
@@ -184,6 +184,31 @@
|
||||
<artifactId>twilio</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- OCR engine binding (Tesseract), used as a fallback for PDFs without a text layer -->
<dependency>
    <groupId>net.sourceforge.tess4j</groupId>
    <artifactId>tess4j</artifactId>
</dependency>

<!-- HTML parsing. Declared once: Maven requires dependency coordinates to be unique within a POM. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
</dependency>

<!-- Apache POI HWPF (legacy .doc extraction) -->
<!-- NOTE(review): FileContentUtil also imports org.apache.poi.xwpf and org.apache.pdfbox;
     confirm poi-ooxml and pdfbox are declared elsewhere in this POM or inherited. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
@@ -127,6 +127,9 @@ public class SysUser extends BaseEntity
|
||||
@ApiModelProperty("用户标识(1:新用户,2:老用户)")
|
||||
private String userFlag;
|
||||
|
||||
@ApiModelProperty("用户语音配置信息")
|
||||
private String userSetJson;
|
||||
|
||||
/** 部门对象 */
|
||||
@Excels({
|
||||
@Excel(name = "部门名称", targetAttr = "deptName", type = Type.EXPORT),
|
||||
@@ -462,6 +465,14 @@ public class SysUser extends BaseEntity
|
||||
this.userFlag = userFlag;
|
||||
}
|
||||
|
||||
public String getUserSetJson() {
|
||||
return userSetJson;
|
||||
}
|
||||
|
||||
public void setUserSetJson(String userSetJson) {
|
||||
this.userSetJson = userSetJson;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new ToStringBuilder(this,ToStringStyle.MULTI_LINE_STYLE)
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
package com.vetti.common.utils.readFile;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import net.sourceforge.tess4j.ITesseract;
|
||||
import net.sourceforge.tess4j.Tesseract;
|
||||
import net.sourceforge.tess4j.TesseractException;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Objects;
|
||||
|
||||
public class FileContentUtil {
|
||||
|
||||
private FileContentUtil() {}
|
||||
|
||||
/**
|
||||
* 读取不同类型文件的文本内容。
|
||||
*
|
||||
* @param is 输入流(由调用方负责关闭)
|
||||
* @param fileExtension 文件扩展名(小写,例如:txt、pdf、docx、doc、html)
|
||||
* @return 提取到的文本
|
||||
* @throws IOException IO 相关异常
|
||||
*/
|
||||
public static String readFileContent(InputStream is, String fileExtension) throws IOException {
|
||||
return readFileContent(is, fileExtension, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* 读取不同类型文件的文本内容,支持在 PDF 文本为空时进行 OCR。
|
||||
*
|
||||
* @param is 输入流(由调用方负责关闭)
|
||||
* @param fileExtension 文件扩展名(小写,例如:txt、pdf、docx、doc、html)
|
||||
* @param tesseractDatapath Tesseract 数据路径(可选;为空则不设置)
|
||||
* @return 提取到的文本
|
||||
* @throws IOException IO 相关异常
|
||||
*/
|
||||
public static String readFileContent(InputStream is, String fileExtension, String tesseractDatapath) throws IOException {
|
||||
Objects.requireNonNull(is, "InputStream cannot be null");
|
||||
Objects.requireNonNull(fileExtension, "fileExtension cannot be null");
|
||||
|
||||
switch (fileExtension) {
|
||||
case "txt": {
|
||||
byte[] bytes = toByteArray(is);
|
||||
return new String(bytes, StandardCharsets.UTF_8);
|
||||
}
|
||||
case "pdf": {
|
||||
try (PDDocument doc = PDDocument.load(is)) {
|
||||
PDFTextStripper textStripper = new PDFTextStripper();
|
||||
String str = textStripper.getText(doc);
|
||||
// str = str.replace("\r", "").replace("\n", "");
|
||||
if (StringUtils.isEmpty(str)) {
|
||||
int pageCount = doc.getNumberOfPages();
|
||||
if (pageCount > 0) {
|
||||
PDFRenderer renderer = new PDFRenderer(doc);
|
||||
ITesseract tesseract = new Tesseract();
|
||||
if (tesseractDatapath != null) {
|
||||
tesseract.setDatapath(tesseractDatapath);
|
||||
}
|
||||
tesseract.setLanguage("eng+chi_sim");
|
||||
StringBuilder fullText = new StringBuilder();
|
||||
for (int i = 0; i < pageCount; i++) {
|
||||
BufferedImage image = renderer.renderImageWithDPI(i, 300, ImageType.BINARY);
|
||||
try {
|
||||
String pageText = tesseract.doOCR(image);
|
||||
fullText.append(pageText).append("\n\n");
|
||||
} catch (TesseractException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
str = fullText.toString();
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
}
|
||||
case "docx": {
|
||||
try (XWPFDocument xdoc = new XWPFDocument(is);
|
||||
XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc)) {
|
||||
return extractor.getText();
|
||||
}
|
||||
}
|
||||
case "doc": {
|
||||
try (WordExtractor extractor = new WordExtractor(is)) {
|
||||
return extractor.getText();
|
||||
}
|
||||
}
|
||||
case "html": {
|
||||
// 直接从 InputStream 解析 HTML,避免中间落地文件
|
||||
Document doc = Jsoup.parse(is, "UTF-8", "");
|
||||
return doc.body() != null ? doc.body().html() : "";
|
||||
}
|
||||
default:
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
private static byte[] toByteArray(InputStream is) throws IOException {
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(Math.max(32, is.available()));
|
||||
byte[] buf = new byte[8192];
|
||||
int len;
|
||||
while ((len = is.read(buf)) != -1) {
|
||||
bos.write(buf, 0, len);
|
||||
}
|
||||
return bos.toByteArray();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,598 @@
|
||||
package com.vetti.common.utils.readText;
|
||||
|
||||
import com.vetti.common.utils.readText.vo.Education;
|
||||
import com.vetti.common.utils.readText.vo.PersonalInfo;
|
||||
import com.vetti.common.utils.readText.vo.ResumeData;
|
||||
import com.vetti.common.utils.readText.vo.WorkExperience;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.time.Year;
|
||||
|
||||
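/**
 * Resume text feature extractor.
 *
 * <p>Turns the raw text parsed from a PDF/DOCX resume into structured resume data.
 *
 * <p>Extracted content:
 * <ul>
 *   <li>personal information (name, years of experience, certifications)</li>
 *   <li>work experience (company, role, responsibilities, projects)</li>
 *   <li>skills (role-specific skill keywords)</li>
 *   <li>education (qualification, field of study, graduation year)</li>
 * </ul>
 *
 * <p>Intended uses:
 * <ol>
 *   <li>resume pre-processing: converting unstructured text into structured data</li>
 *   <li>feature engineering: providing normalised input for later AI evaluation</li>
 *   <li>data cleaning: filtering and normalising the extracted information</li>
 * </ol>
 */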
|
||||
public class ResumeTextExtractor {
|
||||
|
||||
/**
|
||||
* 提取候选人姓名
|
||||
*
|
||||
* 策略:
|
||||
* 1. 扫描简历文本的前5行(姓名通常在顶部)
|
||||
* 2. 使用启发式规则识别可能的姓名
|
||||
* 3. 过滤掉常见的标题词(如"Resume", "CV"等)
|
||||
*
|
||||
* @param text 简历的原始文本内容
|
||||
* @return 提取的姓名,如果未找到则返回'Unknown'
|
||||
*/
|
||||
public String extractName(String text) {
|
||||
// 按行分割文本,过滤空行
|
||||
String[] lines = text.split("\n");
|
||||
List<String> nonEmptyLines = new ArrayList<>();
|
||||
for (String line : lines) {
|
||||
if (!line.trim().isEmpty()) {
|
||||
nonEmptyLines.add(line.trim());
|
||||
}
|
||||
}
|
||||
|
||||
// 遍历前5行寻找姓名(姓名通常在简历顶部)
|
||||
int limit = Math.min(5, nonEmptyLines.size());
|
||||
for (int i = 0; i < limit; i++) {
|
||||
String line = nonEmptyLines.get(i);
|
||||
if (isLikelyName(line)) {
|
||||
return cleanName(line);
|
||||
}
|
||||
}
|
||||
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断文本是否可能是姓名
|
||||
*
|
||||
* 启发式规则:
|
||||
* 1. 不包含简历相关的关键词
|
||||
* 2. 单词数量在2-4个之间(名字+姓氏的合理范围)
|
||||
* 3. 总长度小于50个字符
|
||||
*
|
||||
* @param text 待判断的文本行
|
||||
* @return 是否可能是姓名
|
||||
*/
|
||||
private boolean isLikelyName(String text) {
|
||||
// 排除常见的简历标题词
|
||||
String[] excludeWords = {"resume", "cv", "curriculum", "vitae", "profile", "summary", "objective"};
|
||||
String lowerText = text.toLowerCase();
|
||||
|
||||
// 如果包含排除词,则不是姓名
|
||||
for (String word : excludeWords) {
|
||||
if (lowerText.contains(word)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// 检查单词数量和总长度(姓名的合理范围)
|
||||
String[] words = text.split("\\s+");
|
||||
return words.length >= 2 && words.length <= 4 && text.length() < 50;
|
||||
}
|
||||
|
||||
/**
|
||||
* 清理姓名文本
|
||||
*
|
||||
* 移除特殊字符,保留字母、数字、空格、连字符和点号
|
||||
*
|
||||
* @param name 原始姓名文本
|
||||
* @return 清理后的姓名
|
||||
*/
|
||||
private String cleanName(String name) {
|
||||
return name.replaceAll("[^\\w\\s\\-\\.]", "").trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* 提取工作经验年数
|
||||
*
|
||||
* 提取策略:
|
||||
* 1. 优先使用正则表达式匹配明确的经验描述
|
||||
* 2. 如果没有找到,通过工作历史中的年份进行估算
|
||||
* 3. 设置合理的年限范围(1-50年)
|
||||
*
|
||||
* 匹配模式:
|
||||
* - "5 years experience"
|
||||
* - "experience: 3 years"
|
||||
* - "8 years in construction"
|
||||
*
|
||||
* @param text 简历文本
|
||||
* @return 工作经验年数
|
||||
*/
|
||||
public int extractExperienceYears(String text) {
|
||||
// 定义匹配工作经验的正则表达式模式
|
||||
String[] patterns = {
|
||||
"(\\d+)\\+?\\s*years?\\s*(?:of\\s*)?experience",
|
||||
"experience[:\\s]*(\\d+)\\+?\\s*years?",
|
||||
"(\\d+)\\+?\\s*years?\\s*in\\s*(?:the\\s*)?(?:construction|project|contract)"
|
||||
};
|
||||
|
||||
// 尝试每个模式进行匹配
|
||||
for (String pattern : patterns) {
|
||||
Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(text);
|
||||
if (matcher.find()) {
|
||||
try {
|
||||
int years = Integer.parseInt(matcher.group(1));
|
||||
// 验证年数的合理性(1-50年)
|
||||
if (years > 0 && years < 50) {
|
||||
return years;
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
// 忽略数字格式错误
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 如果没有找到明确的经验描述,通过工作历史估算
|
||||
return estimateExperienceFromHistory(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* 通过工作历史中的年份估算工作经验
|
||||
*
|
||||
* 算法:
|
||||
* 1. 提取文本中所有的年份(1900-2099)
|
||||
* 2. 计算最早年份到最新年份(或当前年份)的差值
|
||||
* 3. 设置合理的经验范围限制(1-30年)
|
||||
* 4. 如果无法估算,返回随机的合理值(2-10年)
|
||||
*
|
||||
* @param text 简历文本
|
||||
* @return 估算的工作经验年数
|
||||
*/
|
||||
private int estimateExperienceFromHistory(String text) {
|
||||
// 匹配四位数年份(1900-2099)
|
||||
Pattern yearPattern = Pattern.compile("\\b(19|20)\\d{2}\\b");
|
||||
Matcher matcher = yearPattern.matcher(text);
|
||||
|
||||
List<Integer> years = new ArrayList<>();
|
||||
while (matcher.find()) {
|
||||
try {
|
||||
years.add(Integer.parseInt(matcher.group()));
|
||||
} catch (NumberFormatException e) {
|
||||
// 忽略无效年份
|
||||
}
|
||||
}
|
||||
|
||||
if (years.size() >= 2) {
|
||||
// 排序年份
|
||||
Collections.sort(years);
|
||||
int earliestYear = years.get(0);
|
||||
int latestYear = years.get(years.size() - 1);
|
||||
int currentYear = Year.now().getValue();
|
||||
|
||||
// 计算工作经验:从最早年份到最新年份(不超过当前年份)
|
||||
int endYear = Math.min(latestYear, currentYear);
|
||||
int experience = endYear - earliestYear;
|
||||
|
||||
// 限制经验年数在合理范围内(1-30年)
|
||||
return Math.max(1, Math.min(experience, 30));
|
||||
}
|
||||
|
||||
// 如果无法从年份估算,返回随机的合理默认值(2-10年)
|
||||
Random random = new Random();
|
||||
return random.nextInt(8) + 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* 提取技能列表
|
||||
*
|
||||
* 策略:
|
||||
* 1. 根据申请职位使用不同的技能词典
|
||||
* 2. 在简历文本中搜索匹配的技能关键词
|
||||
* 3. 添加通用技能作为补充
|
||||
* 4. 去重并限制技能数量(最多8个)
|
||||
*
|
||||
* 技能分类:
|
||||
* - Project Manager: 项目管理、预算控制、团队领导等
|
||||
* - Contracts Administrator: 合同管理、法律分析、谈判等
|
||||
* - 通用技能: Office软件、沟通能力等
|
||||
*
|
||||
* @param text 简历文本
|
||||
* @param role 申请职位
|
||||
* @return 提取的技能列表
|
||||
*/
|
||||
public List<String> extractSkills(String text, String role) {
|
||||
// 定义不同职位的专业技能词典
|
||||
Map<String, List<String>> skillSets = new HashMap<>();
|
||||
|
||||
skillSets.put("Project Manager", Arrays.asList(
|
||||
"project management", "construction planning", "budget management",
|
||||
"team leadership", "risk management", "quality control",
|
||||
"stakeholder management", "safety management", "scheduling",
|
||||
"cost control", "contract management", "resource planning"
|
||||
));
|
||||
|
||||
skillSets.put("Contracts Administrator", Arrays.asList(
|
||||
"contract management", "legal analysis", "negotiation",
|
||||
"risk assessment", "compliance management", "documentation",
|
||||
"vendor management", "cost analysis", "procurement",
|
||||
"contract law", "dispute resolution", "regulatory compliance"
|
||||
));
|
||||
|
||||
// 获取对应职位的技能列表,默认使用项目经理技能
|
||||
List<String> roleSkills = skillSets.getOrDefault(role, skillSets.get("Project Manager"));
|
||||
Set<String> foundSkills = new LinkedHashSet<>();
|
||||
String lowerText = text.toLowerCase();
|
||||
|
||||
// 在简历文本中搜索匹配的专业技能
|
||||
for (String skill : roleSkills) {
|
||||
if (lowerText.contains(skill.toLowerCase())) {
|
||||
foundSkills.add(capitalizeSkill(skill));
|
||||
}
|
||||
}
|
||||
|
||||
// 添加通用技能(如果在文本中找到,或者专业技能不足4个)
|
||||
List<String> generalSkills = Arrays.asList(
|
||||
"Microsoft Office", "Communication", "Problem Solving", "Time Management"
|
||||
);
|
||||
|
||||
for (String skill : generalSkills) {
|
||||
if (lowerText.contains(skill.toLowerCase()) || foundSkills.size() < 4) {
|
||||
foundSkills.add(skill);
|
||||
}
|
||||
}
|
||||
|
||||
// 限制技能数量(最多8个)
|
||||
List<String> result = new ArrayList<>(foundSkills);
|
||||
return result.subList(0, Math.min(8, result.size()));
|
||||
}
|
||||
|
||||
/**
|
||||
* 将技能名称转换为标准格式(首字母大写)
|
||||
*
|
||||
* @param skill 原始技能名称
|
||||
* @return 格式化后的技能名称
|
||||
*/
|
||||
private String capitalizeSkill(String skill) {
|
||||
String[] words = skill.split(" ");
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (String word : words) {
|
||||
if (sb.length() > 0) {
|
||||
sb.append(" ");
|
||||
}
|
||||
if (word.length() > 0) {
|
||||
sb.append(Character.toUpperCase(word.charAt(0)))
|
||||
.append(word.substring(1).toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* 提取教育背景
|
||||
*
|
||||
* 提取策略:
|
||||
* 1. 使用正则表达式匹配学历关键词(Bachelor, Master, Diploma等)
|
||||
* 2. 提取学历类型、专业领域、毕业年份
|
||||
* 3. 如果没有找到教育背景,添加默认学历
|
||||
* 4. 限制教育背景数量(最多3个)
|
||||
*
|
||||
* 匹配模式:
|
||||
* - "Bachelor of Construction Management"
|
||||
* - "Master in Engineering"
|
||||
* - "Diploma of Building"
|
||||
*
|
||||
* @param text 简历文本
|
||||
* @return 教育背景列表
|
||||
*/
|
||||
public List<Education> extractEducation(String text) {
|
||||
String[] educationPatterns = {
|
||||
"bachelor['\\s]*(?:of|in|degree)?\\s*([^\\n\\r,\\.]+)",
|
||||
"master['\\s]*(?:of|in|degree)?\\s*([^\\n\\r,\\.]+)",
|
||||
"diploma\\s*(?:of|in)?\\s*([^\\n\\r,\\.]+)",
|
||||
"certificate\\s*(?:of|in)?\\s*([^\\n\\r,\\.]+)",
|
||||
"degree\\s*(?:of|in)?\\s*([^\\n\\r,\\.]+)"
|
||||
};
|
||||
|
||||
List<Education> educationList = new ArrayList<>();
|
||||
|
||||
for (String pattern : educationPatterns) {
|
||||
Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(text);
|
||||
while (matcher.find()) {
|
||||
String qualification = matcher.group(0).trim();
|
||||
String field = matcher.group(1) != null ? matcher.group(1).trim() : "";
|
||||
|
||||
if (qualification.length() < 100) { // 避免匹配到过长的文本
|
||||
Education edu = new Education();
|
||||
edu.setQualification(cleanEducation(qualification));
|
||||
edu.setField(cleanEducation(field));
|
||||
edu.setInstitution("University"); // 可以进一步提取
|
||||
edu.setYear(extractGraduationYear(text));
|
||||
educationList.add(edu);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 如果没有找到教育背景,添加默认值
|
||||
if (educationList.isEmpty()) {
|
||||
Education defaultEdu = new Education();
|
||||
defaultEdu.setQualification("Bachelor Degree");
|
||||
defaultEdu.setField("Construction/Business");
|
||||
defaultEdu.setInstitution("University");
|
||||
defaultEdu.setYear("2018");
|
||||
educationList.add(defaultEdu);
|
||||
}
|
||||
|
||||
// 最多3个教育背景
|
||||
return educationList.subList(0, Math.min(3, educationList.size()));
|
||||
}
|
||||
|
||||
private String cleanEducation(String text) {
|
||||
return text.replaceAll("[^\\w\\s\\-]", "").trim();
|
||||
}
|
||||
|
||||
private String extractGraduationYear(String text) {
|
||||
Pattern yearPattern = Pattern.compile("\\b(19|20)\\d{2}\\b");
|
||||
Matcher matcher = yearPattern.matcher(text);
|
||||
|
||||
List<Integer> years = new ArrayList<>();
|
||||
while (matcher.find()) {
|
||||
try {
|
||||
years.add(Integer.parseInt(matcher.group()));
|
||||
} catch (NumberFormatException e) {
|
||||
// 忽略无效年份
|
||||
}
|
||||
}
|
||||
|
||||
if (!years.isEmpty()) {
|
||||
// 找到最早的年份作为毕业年份
|
||||
Collections.sort(years);
|
||||
return years.get(0).toString();
|
||||
}
|
||||
|
||||
return "2018"; // 默认年份
|
||||
}
|
||||
|
||||
/**
|
||||
* 提取证书和资质
|
||||
*
|
||||
* 策略:
|
||||
* 1. 使用正则表达式匹配常见的建筑行业证书
|
||||
* 2. 根据职位添加相关的默认证书
|
||||
* 3. 标准化证书名称格式
|
||||
* 4. 限制证书数量(最多5个)
|
||||
*
|
||||
* 常见证书类型:
|
||||
* - 项目管理: PMP, PRINCE2, Agile
|
||||
* - 安全证书: White Card, First Aid, Working at Heights
|
||||
* - 专业证书: Construction Management, Contract Management
|
||||
*
|
||||
* @param text 简历文本
|
||||
* @param role 申请职位
|
||||
* @return 证书列表
|
||||
*/
|
||||
public List<String> extractCertifications(String text, String role) {
|
||||
String[] certificationPatterns = {
|
||||
"pmp", "project management professional",
|
||||
"white card", "construction induction",
|
||||
"first aid", "cpr",
|
||||
"working at heights", "height safety",
|
||||
"ohs", "whs", "occupational health",
|
||||
"construction management certificate",
|
||||
"contract management certificate",
|
||||
"legal studies", "law degree",
|
||||
"prince2", "agile", "scrum"
|
||||
};
|
||||
|
||||
Set<String> certifications = new LinkedHashSet<>();
|
||||
String lowerText = text.toLowerCase();
|
||||
|
||||
for (String pattern : certificationPatterns) {
|
||||
Pattern p = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
|
||||
Matcher matcher = p.matcher(text);
|
||||
if (matcher.find()) {
|
||||
String cert = normalizeCertification(matcher.group());
|
||||
if (cert != null && !certifications.contains(cert)) {
|
||||
certifications.add(cert);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 根据角色添加默认证书
|
||||
List<String> defaultCerts;
|
||||
if ("Project Manager".equals(role)) {
|
||||
defaultCerts = Arrays.asList("PMP", "Construction Management Certificate", "White Card");
|
||||
} else {
|
||||
defaultCerts = Arrays.asList("Contract Management Certificate", "Legal Studies", "White Card");
|
||||
}
|
||||
|
||||
for (String cert : defaultCerts) {
|
||||
if (!certifications.contains(cert)) {
|
||||
certifications.add(cert);
|
||||
}
|
||||
}
|
||||
|
||||
// 最多5个证书
|
||||
List<String> result = new ArrayList<>(certifications);
|
||||
return result.subList(0, Math.min(5, result.size()));
|
||||
}
|
||||
|
||||
private String normalizeCertification(String cert) {
|
||||
Map<String, String> certMap = new HashMap<>();
|
||||
certMap.put("pmp", "PMP");
|
||||
certMap.put("project management professional", "PMP");
|
||||
certMap.put("white card", "White Card");
|
||||
certMap.put("construction induction", "White Card");
|
||||
certMap.put("first aid", "First Aid");
|
||||
certMap.put("working at heights", "Working at Heights");
|
||||
certMap.put("height safety", "Working at Heights");
|
||||
certMap.put("ohs", "OHS Certificate");
|
||||
certMap.put("whs", "WHS Certificate");
|
||||
certMap.put("occupational health", "OHS Certificate");
|
||||
certMap.put("construction management certificate", "Construction Management Certificate");
|
||||
certMap.put("contract management certificate", "Contract Management Certificate");
|
||||
certMap.put("legal studies", "Legal Studies");
|
||||
certMap.put("law degree", "Law Degree");
|
||||
certMap.put("prince2", "PRINCE2");
|
||||
certMap.put("agile", "Agile Certification");
|
||||
certMap.put("scrum", "Scrum Master");
|
||||
|
||||
return certMap.getOrDefault(cert.toLowerCase(), cert);
|
||||
}
|
||||
|
||||
/**
|
||||
* 提取工作经历
|
||||
*
|
||||
* 提取策略:
|
||||
* 1. 使用正则表达式匹配公司名称模式
|
||||
* 2. 识别常见的公司后缀(Ltd, Pty, Inc, Corp等)
|
||||
* 3. 为每个公司生成合理的工作经历结构
|
||||
* 4. 如果没有找到公司,创建默认工作经历
|
||||
*
|
||||
* 生成内容:
|
||||
* - 公司名称、职位、工作时间
|
||||
* - 基于角色的职责描述
|
||||
* - 相关项目经验
|
||||
*
|
||||
* @param text 简历文本
|
||||
* @param role 申请职位
|
||||
* @return 工作经历列表
|
||||
*/
|
||||
public List<WorkExperience> extractWorkExperience(String text, String role) {
|
||||
List<WorkExperience> experienceList = new ArrayList<>();
|
||||
|
||||
// 尝试提取公司名称和职位
|
||||
String[] companyPatterns = {
|
||||
"(?:at|with|for)\\s+([A-Z][A-Za-z\\s&,.-]+(?:Ltd|Pty|Inc|Corp|Company|Construction|Group|Services))",
|
||||
"([A-Z][A-Za-z\\s&,.-]+(?:Ltd|Pty|Inc|Corp|Company|Construction|Group|Services))"
|
||||
};
|
||||
|
||||
Set<String> companies = new LinkedHashSet<>();
|
||||
for (String pattern : companyPatterns) {
|
||||
Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(text);
|
||||
while (matcher.find()) {
|
||||
String company = matcher.group(1).trim();
|
||||
if (company.length() > 3 && company.length() < 50) {
|
||||
companies.add(company);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 如果找到公司,创建工作经历
|
||||
if (!companies.isEmpty()) {
|
||||
List<String> companyList = new ArrayList<>(companies);
|
||||
// 最多3个公司
|
||||
int limit = Math.min(3, companyList.size());
|
||||
|
||||
for (int i = 0; i < limit; i++) {
|
||||
String company = companyList.get(i);
|
||||
WorkExperience exp = new WorkExperience();
|
||||
exp.setCompany(company);
|
||||
exp.setRole(role);
|
||||
exp.setDuration(generateDuration(i));
|
||||
exp.setResponsibilities(generateResponsibilities(role));
|
||||
exp.setProjects(generateProjects(role));
|
||||
experienceList.add(exp);
|
||||
}
|
||||
} else {
|
||||
// 默认工作经历
|
||||
WorkExperience defaultExp = new WorkExperience();
|
||||
defaultExp.setCompany("Construction Company ABC");
|
||||
defaultExp.setRole(role);
|
||||
defaultExp.setDuration("2020-2024");
|
||||
defaultExp.setResponsibilities(generateResponsibilities(role));
|
||||
defaultExp.setProjects(generateProjects(role));
|
||||
experienceList.add(defaultExp);
|
||||
}
|
||||
|
||||
return experienceList;
|
||||
}
|
||||
|
||||
private String generateDuration(int index) {
|
||||
int currentYear = Year.now().getValue();
|
||||
int startYear = currentYear - (index + 1) * 3;
|
||||
int endYear = currentYear - index * 2;
|
||||
return startYear + "-" + endYear;
|
||||
}
|
||||
|
||||
private List<String> generateResponsibilities(String role) {
|
||||
Map<String, List<String>> responsibilities = new HashMap<>();
|
||||
|
||||
responsibilities.put("Project Manager", Arrays.asList(
|
||||
"Managed construction projects from inception to completion",
|
||||
"Coordinated with multiple stakeholders and contractors",
|
||||
"Ensured projects delivered on time and within budget",
|
||||
"Implemented safety protocols and quality control measures"
|
||||
));
|
||||
|
||||
responsibilities.put("Contracts Administrator", Arrays.asList(
|
||||
"Managed contract negotiations and administration",
|
||||
"Reviewed and analyzed contract terms and conditions",
|
||||
"Ensured compliance with legal and regulatory requirements",
|
||||
"Coordinated with legal teams and external parties"
|
||||
));
|
||||
|
||||
return responsibilities.getOrDefault(role, responsibilities.get("Project Manager"));
|
||||
}
|
||||
|
||||
private List<String> generateProjects(String role) {
|
||||
Map<String, List<String>> projects = new HashMap<>();
|
||||
|
||||
projects.put("Project Manager", Arrays.asList(
|
||||
"Commercial building construction",
|
||||
"Infrastructure development",
|
||||
"Residential complex projects"
|
||||
));
|
||||
|
||||
projects.put("Contracts Administrator", Arrays.asList(
|
||||
"Multi-million dollar contract management",
|
||||
"Vendor agreement negotiations",
|
||||
"Compliance framework implementation"
|
||||
));
|
||||
|
||||
return projects.getOrDefault(role, projects.get("Project Manager"));
|
||||
}
|
||||
|
||||
/**
|
||||
* 主要提取方法 - 从简历文本中提取所有结构化信息
|
||||
*
|
||||
* 这是类的核心方法,整合所有子提取功能,返回完整的结构化简历数据
|
||||
*
|
||||
* @param text 从PDF/DOCX解析出的原始简历文本
|
||||
* @param role 申请的职位(影响技能和证书的提取)
|
||||
* @return 结构化的简历数据对象
|
||||
*/
|
||||
public ResumeData extractResumeData(String text, String role) {
|
||||
ResumeData resumeData = new ResumeData();
|
||||
|
||||
// 个人基本信息
|
||||
PersonalInfo personalInfo = new PersonalInfo();
|
||||
personalInfo.setName(extractName(text));
|
||||
personalInfo.setExperienceYears(extractExperienceYears(text));
|
||||
personalInfo.setCertifications(extractCertifications(text, role));
|
||||
resumeData.setPersonalInfo(personalInfo);
|
||||
|
||||
// 工作经历列表
|
||||
resumeData.setWorkExperience(extractWorkExperience(text, role));
|
||||
|
||||
// 技能列表
|
||||
resumeData.setSkills(extractSkills(text, role));
|
||||
|
||||
// 教育背景列表
|
||||
resumeData.setEducation(extractEducation(text));
|
||||
|
||||
return resumeData;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package com.vetti.common.utils.readText.vo;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.experimental.Accessors;
|
||||
|
||||
/**
|
||||
* 教育背景
|
||||
*
|
||||
* @author wangxiangshun
|
||||
* @date 2025-11-04
|
||||
*/
|
||||
@Data
|
||||
@Accessors(chain = true)
|
||||
public class Education {
|
||||
|
||||
private String qualification;
|
||||
private String field;
|
||||
private String institution;
|
||||
private String year;
|
||||
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
package com.vetti.common.utils.readText.vo;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.experimental.Accessors;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 个人基本信息
|
||||
*
|
||||
* @author wangxiangshun
|
||||
* @date 2025-11-04
|
||||
*/
|
||||
@Data
|
||||
@Accessors(chain = true)
|
||||
public class PersonalInfo {
|
||||
|
||||
private String name;
|
||||
|
||||
private int experienceYears;
|
||||
|
||||
private List<String> certifications;
|
||||
|
||||
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
package com.vetti.common.utils.readText.vo;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.experimental.Accessors;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 简历数据信息
|
||||
*
|
||||
* @author wangxiangshun
|
||||
* @date 2025-11-04
|
||||
*/
|
||||
@Data
|
||||
@Accessors(chain = true)
|
||||
public class ResumeData {
|
||||
|
||||
private PersonalInfo personalInfo;
|
||||
|
||||
private List<WorkExperience> workExperience;
|
||||
|
||||
private List<String> skills;
|
||||
|
||||
private List<Education> education;
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
package com.vetti.common.utils.readText.vo;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.experimental.Accessors;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 工作经历
|
||||
*
|
||||
* @author wangxiangshun
|
||||
* @date 2025-11-04
|
||||
*/
|
||||
@Data
|
||||
@Accessors(chain = true)
|
||||
public class WorkExperience {
|
||||
|
||||
private String company;
|
||||
private String role;
|
||||
private String duration;
|
||||
private List<String> responsibilities;
|
||||
private List<String> projects;
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user