简历读取基础逻辑添加以及用户语音配置信息字段添加

2025-11-04 19:47:20 +08:00
parent 1e67460f0a
commit 86e2fe238b
18 changed files with 1323 additions and 6 deletions
--- a/vetti-common/pom.xml
+++ b/vetti-common/pom.xml
@@ -184,6 +184,27 @@
            <artifactId>twilio</artifactId>
        </dependency>

+        <!-- OCR: Tesseract wrapper -->
+        <dependency>
+            <groupId>net.sourceforge.tess4j</groupId>
+            <artifactId>tess4j</artifactId>
+        </dependency>
+
+        <!-- HTML parsing; declared once, a second identical declaration is a Maven model warning -->
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+        </dependency>
+
+        <!-- Apache POI HWPF -->
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-scratchpad</artifactId>
+        </dependency>

    </dependencies>

--- a/vetti-common/src/main/java/com/vetti/common/core/domain/entity/SysUser.java
+++ b/vetti-common/src/main/java/com/vetti/common/core/domain/entity/SysUser.java
@@ -127,6 +127,9 @@ public class SysUser extends BaseEntity
    @ApiModelProperty("用户标识(1:新用户,2:老用户)")
    private String userFlag;

+    @ApiModelProperty("用户语音配置信息")
+    private String userSetJson;
+
    /** 部门对象 */
    @Excels({
        @Excel(name = "部门名称", targetAttr = "deptName", type = Type.EXPORT),
@@ -462,6 +465,14 @@ public class SysUser extends BaseEntity
        this.userFlag = userFlag;
    }

+    public String getUserSetJson() { // returns the stored value unchanged; the field is labelled "user voice configuration info"
+        return userSetJson;
+    }
+
+    public void setUserSetJson(String userSetJson) { // plain assignment; the string is neither validated nor parsed here
+        this.userSetJson = userSetJson;
+    }
+
    @Override
    public String toString() {
        return new ToStringBuilder(this,ToStringStyle.MULTI_LINE_STYLE)
--- a/vetti-common/src/main/java/com/vetti/common/utils/readFile/FileContentUtil.java
+++ b/vetti-common/src/main/java/com/vetti/common/utils/readFile/FileContentUtil.java
@@ -0,0 +1,119 @@
+package com.vetti.common.utils.readFile;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import net.sourceforge.tess4j.ITesseract;
+import net.sourceforge.tess4j.Tesseract;
+import net.sourceforge.tess4j.TesseractException;
+
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Objects;
+
+public  class FileContentUtil {
+
+    private FileContentUtil() {} // private: the class exposes only static methods and is not meant to be instantiated
+
+    /**
+     * Reads the text content of a file of the given type; delegates with no Tesseract data path.
+     *
+     * @param is                 input stream (closing it is the caller's responsibility)
+     * @param fileExtension      file extension (lower case, e.g. txt, pdf, docx, doc, html)
+     * @return                   the extracted text
+     * @throws IOException       on I/O errors
+     */
+    public static String readFileContent(InputStream is, String fileExtension) throws IOException {
+        return readFileContent(is, fileExtension, null);
+    }
+
+    /**
+     * Reads the text content of a file of the given type; a PDF whose text layer is blank
+     * is rendered page by page at 300 DPI and passed through Tesseract OCR.
+     * Extensions are matched case-insensitively; unsupported ones yield an empty string.
+     *
+     * @param is                 input stream (closing it is the caller's responsibility)
+     * @param fileExtension      file extension without the dot, e.g. txt, pdf, docx, doc, html
+     * @param tesseractDatapath  Tesseract data path (optional; not set when {@code null})
+     * @return                   the extracted text
+     * @throws IOException       on I/O errors
+     * @throws RuntimeException  wrapping the {@code TesseractException} when OCR of a page fails
+     */
+    public static String readFileContent(InputStream is, String fileExtension, String tesseractDatapath) throws IOException {
+        Objects.requireNonNull(is, "InputStream cannot be null");
+        Objects.requireNonNull(fileExtension, "fileExtension cannot be null");
+
+        switch (fileExtension.trim().toLowerCase(java.util.Locale.ROOT)) {
+            case "txt": {
+                return new String(toByteArray(is), StandardCharsets.UTF_8);
+            }
+            case "pdf": {
+                try (PDDocument doc = PDDocument.load(is)) {
+                    String str = new PDFTextStripper().getText(doc);
+                    // Scanned PDFs tend to yield only line breaks, so test for blank, not empty.
+                    if (StringUtils.isBlank(str)) {
+                        int pageCount = doc.getNumberOfPages();
+                        if (pageCount > 0) {
+                            PDFRenderer renderer = new PDFRenderer(doc);
+                            ITesseract tesseract = new Tesseract();
+                            if (tesseractDatapath != null) {
+                                tesseract.setDatapath(tesseractDatapath);
+                            }
+                            tesseract.setLanguage("eng+chi_sim");
+                            StringBuilder fullText = new StringBuilder();
+                            for (int i = 0; i < pageCount; i++) {
+                                BufferedImage image = renderer.renderImageWithDPI(i, 300, ImageType.BINARY);
+                                try {
+                                    fullText.append(tesseract.doOCR(image)).append("\n\n");
+                                } catch (TesseractException e) {
+                                    throw new RuntimeException("OCR failed on PDF page " + (i + 1), e);
+                                }
+                            }
+                            str = fullText.toString();
+                        }
+                    }
+                    return str;
+                }
+            }
+            case "docx": {
+                try (XWPFDocument xdoc = new XWPFDocument(is);
+                     XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc)) {
+                    return extractor.getText();
+                }
+            }
+            case "doc": {
+                try (WordExtractor extractor = new WordExtractor(is)) {
+                    return extractor.getText();
+                }
+            }
+            case "html": {
+                // Parsed straight from the stream. NOTE(review): html() returns markup, not plain text - confirm.
+                Document doc = Jsoup.parse(is, "UTF-8", "");
+                return doc.body() != null ? doc.body().html() : "";
+            }
+            default:
+                return "";
+        }
+    }
+
+    /** Drains the stream into a byte array; the buffer is presized from {@code available()}, minimum 32. */
+    private static byte[] toByteArray(InputStream is) throws IOException {
+        ByteArrayOutputStream sink = new ByteArrayOutputStream(Math.max(32, is.available()));
+        byte[] chunk = new byte[8192];
+        for (int read = is.read(chunk); read != -1; read = is.read(chunk)) {
+            sink.write(chunk, 0, read);
+        }
+        return sink.toByteArray();
+    }
+}
--- a/vetti-common/src/main/java/com/vetti/common/utils/readText/ResumeTextExtractor.java
+++ b/vetti-common/src/main/java/com/vetti/common/utils/readText/ResumeTextExtractor.java
@@ -0,0 +1,598 @@
+package com.vetti.common.utils.readText;
+
+import com.vetti.common.utils.readText.vo.Education;
+import com.vetti.common.utils.readText.vo.PersonalInfo;
+import com.vetti.common.utils.readText.vo.ResumeData;
+import com.vetti.common.utils.readText.vo.WorkExperience;
+
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.time.Year;
+
+/**
+ * 简历文本特征提取器
+ *
+ * 功能：从PDF/DOCX解析出的原始文本中提取结构化的简历信息
+ *
+ * 主要提取内容：
+ * - 个人信息（姓名、工作年限、证书）
+ * - 工作经历（公司、职位、职责、项目）
+ * - 技能列表（基于角色的相关技能）
+ * - 教育背景（学历、专业、毕业年份）
+ *
+ * 使用场景：
+ * 1. 简历预处理 - 将非结构化文本转换为结构化数据
+ * 2. 特征工程 - 为后续的AI评估提供标准化输入
+ * 3. 数据清洗 - 过滤和规范化提取的信息
+ */
+public class ResumeTextExtractor {
+
+    /**
+     * Finds the candidate's name near the top of the resume.
+     *
+     * <p>Only the first five non-blank lines are inspected. Each one is trimmed and
+     * handed to {@code isLikelyName}; the first line that passes is returned after
+     * being sanitised by {@code cleanName}.
+     *
+     * @param text raw resume text; must not be {@code null}
+     * @return the cleaned name, or the literal {@code "Unknown"} when none of the
+     *         inspected lines looks like a name
+     */
+    public String extractName(String text) {
+        final int maxLinesToInspect = 5;
+        int inspected = 0;
+
+        for (String rawLine : text.split("\n")) {
+            String candidate = rawLine.trim();
+            if (candidate.isEmpty()) {
+                // Blank lines do not count towards the five-line budget.
+                continue;
+            }
+            if (isLikelyName(candidate)) {
+                return cleanName(candidate);
+            }
+            inspected++;
+            if (inspected >= maxLinesToInspect) {
+                break;
+            }
+        }
+
+        return "Unknown";
+    }
+
+    /**
+     * Heuristic check for whether a single trimmed line looks like a person's name.
+     *
+     * <p>A line qualifies when all of the following hold:
+     * <ol>
+     *   <li>it contains none of the usual resume heading words as a whole word;</li>
+     *   <li>it splits into 2 to 4 whitespace-separated tokens;</li>
+     *   <li>it is shorter than 50 characters.</li>
+     * </ol>
+     *
+     * <p>Heading words are matched on word boundaries so that surnames which merely
+     * contain such letters (for example "McVey" contains "cv") are not discarded.
+     *
+     * @param text the line to classify
+     * @return {@code true} when the line passes every rule above
+     */
+    private boolean isLikelyName(String text) {
+        // Case-insensitive, whole-word match against typical document headings.
+        Pattern headingWord = Pattern.compile(
+                "\\b(?:resume|cv|curriculum|vitae|profile|summary|objective)\\b",
+                Pattern.CASE_INSENSITIVE);
+        if (headingWord.matcher(text).find()) {
+            return false;
+        }
+        int tokenCount = text.split("\\s+").length;
+        return tokenCount >= 2 && tokenCount <= 4 && text.length() < 50;
+    }
+
+    /**
+     * Strips decoration from a detected name line, keeping letters, combining marks and
+     * digits of any script plus underscores, whitespace, hyphens and dots.
+     * Unicode classes are used because plain {@code \w} is ASCII-only and would erase
+     * accented or CJK characters (e.g. "José" would become "Jos").
+     * @param name the raw line believed to hold a name
+     * @return the sanitised, trimmed name
+     */
+    private String cleanName(String name) {
+        return name.replaceAll("[^\\p{L}\\p{M}\\p{N}_\\s\\-.]", "").trim();
+    }
+
+    /**
+     * Extracts the number of years of work experience stated in the resume.
+     *
+     * <p>Three case-insensitive phrasings are tried in order, for example:
+     * <ul>
+     *   <li>"5 years experience" / "5+ years of experience"</li>
+     *   <li>"experience: 3 years"</li>
+     *   <li>"8 years in construction"</li>
+     * </ul>
+     * For each phrasing only its first occurrence is examined. The number is accepted
+     * when it lies in the documented range of 1 to 50 years, both ends included.
+     * If no phrasing yields an acceptable number, the result is delegated to
+     * {@code estimateExperienceFromHistory}.
+     *
+     * @param text resume text; must not be {@code null}
+     * @return the stated years of experience, or the estimate from the fallback
+     */
+    public int extractExperienceYears(String text) {
+        String[] patterns = {
+                "(\\d+)\\+?\\s*years?\\s*(?:of\\s*)?experience",
+                "experience[:\\s]*(\\d+)\\+?\\s*years?",
+                "(\\d+)\\+?\\s*years?\\s*in\\s*(?:the\\s*)?(?:construction|project|contract)"
+        };
+
+        for (String pattern : patterns) {
+            Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(text);
+            if (!matcher.find()) {
+                continue;
+            }
+            try {
+                int years = Integer.parseInt(matcher.group(1));
+                // Upper bound is inclusive so that "50 years" is honoured as documented.
+                if (years >= 1 && years <= 50) {
+                    return years;
+                }
+            } catch (NumberFormatException ignored) {
+                // Digit run too long for an int: treat as "not stated" and try the next phrasing.
+            }
+        }
+
+        // Nothing explicit was usable; fall back to the year-span heuristic.
+        return estimateExperienceFromHistory(text);
+    }
+
+    /**
+     * Estimates years of experience from the calendar years mentioned in the text.
+     *
+     * <p>Every standalone four-digit number beginning with 19 or 20 counts as a year.
+     * With at least two such mentions, the estimate is the distance from the smallest
+     * year to the largest one (the latter capped at the current year), clamped to
+     * the range 1..30.
+     *
+     * <p>With fewer than two mentions a pseudo-random value is returned instead;
+     * {@code nextInt(8) + 2} yields 2..9 inclusive, so repeated calls on the same text
+     * may return different numbers.
+     *
+     * <p>NOTE(review): the earliest year in a resume is often a graduation year rather
+     * than a first job, and callers cannot tell the random fallback from a real value -
+     * confirm both are intended.
+     *
+     * @param text resume text
+     * @return the estimated number of years, always at least 1
+     */
+    private int estimateExperienceFromHistory(String text) {
+        Matcher yearMatcher = Pattern.compile("\\b(19|20)\\d{2}\\b").matcher(text);
+
+        int mentions = 0;
+        int earliest = Integer.MAX_VALUE;
+        int latest = Integer.MIN_VALUE;
+        while (yearMatcher.find()) {
+            // The pattern admits exactly four digits, so parsing cannot fail.
+            int year = Integer.parseInt(yearMatcher.group());
+            earliest = Math.min(earliest, year);
+            latest = Math.max(latest, year);
+            mentions++;
+        }
+
+        if (mentions < 2) {
+            // Too few data points: hand back a value in 2..9.
+            return new Random().nextInt(8) + 2;
+        }
+
+        int spanEnd = Math.min(latest, Year.now().getValue());
+        int span = spanEnd - earliest;
+
+        // Clamp to 1..30.
+        int capped = Math.min(span, 30);
+        return Math.max(1, capped);
+    }
+
+    /**
+     * Builds the skill list for a candidate applying for the given role.
+     *
+     * <p>Steps:
+     * <ol>
+     *   <li>Choose the role vocabulary: "Contracts Administrator" has its own list,
+     *       every other role value (including {@code null}) uses the
+     *       "Project Manager" list.</li>
+     *   <li>Keep each vocabulary entry that occurs in the lower-cased text, rendered
+     *       with {@code capitalizeSkill}.</li>
+     *   <li>Walk the four general skills in order; one is added when the text mentions
+     *       it or while fewer than four skills have been collected so far.</li>
+     *   <li>Return at most the first eight, in insertion order and without duplicates.</li>
+     * </ol>
+     * <p>NOTE(review): step 3 can report general skills that the resume never
+     * mentions - confirm that padding is wanted.
+     *
+     * @param text resume text; must not be {@code null}
+     * @param role the position applied for
+     * @return up to eight skill names
+     */
+    public List<String> extractSkills(String text, String role) {
+        List<String> projectManagerSkills = Arrays.asList(
+                "project management", "construction planning", "budget management",
+                "team leadership", "risk management", "quality control",
+                "stakeholder management", "safety management", "scheduling",
+                "cost control", "contract management", "resource planning"
+        );
+        List<String> contractsAdministratorSkills = Arrays.asList(
+                "contract management", "legal analysis", "negotiation",
+                "risk assessment", "compliance management", "documentation",
+                "vendor management", "cost analysis", "procurement",
+                "contract law", "dispute resolution", "regulatory compliance"
+        );
+        List<String> generalSkills = Arrays.asList(
+                "Microsoft Office", "Communication", "Problem Solving", "Time Management"
+        );
+
+        // Anything other than the contracts role falls back to the project-manager list.
+        List<String> vocabulary = "Contracts Administrator".equals(role)
+                ? contractsAdministratorSkills
+                : projectManagerSkills;
+
+        String haystack = text.toLowerCase();
+        Set<String> collected = new LinkedHashSet<>();
+
+        for (String entry : vocabulary) {
+            if (haystack.contains(entry.toLowerCase())) {
+                collected.add(capitalizeSkill(entry));
+            }
+        }
+
+        for (String general : generalSkills) {
+            boolean mentioned = haystack.contains(general.toLowerCase());
+            // The size check is re-evaluated per entry, so padding stops at four skills.
+            if (mentioned || collected.size() < 4) {
+                collected.add(general);
+            }
+        }
+
+        List<String> ordered = new ArrayList<>(collected);
+        return ordered.subList(0, Math.min(8, ordered.size()));
+    }
+
+    /**
+     * Renders a skill phrase in title case: the first character of every
+     * space-separated word is upper-cased and the remainder lower-cased.
+     *
+     * @param skill the skill phrase, e.g. "risk management"
+     * @return the phrase in title case, e.g. "Risk Management"
+     */
+    private String capitalizeSkill(String skill) {
+        StringBuilder titled = new StringBuilder();
+        for (String part : skill.split(" ")) {
+            // A separator is emitted once anything has been written, even for empty parts.
+            if (titled.length() != 0) {
+                titled.append(' ');
+            }
+            if (part.isEmpty()) {
+                continue;
+            }
+            titled.append(Character.toUpperCase(part.charAt(0)));
+            titled.append(part.substring(1).toLowerCase());
+        }
+        return titled.toString();
+    }
+
+    /**
+     * Extracts up to three education entries from the resume.
+     *
+     * <p>Five case-insensitive phrasings (bachelor, master, diploma, certificate, degree)
+     * are searched in turn; a match shorter than 100 characters becomes an entry with the
+     * placeholder institution "University". A match lying entirely inside an earlier
+     * accepted match is skipped, so "Bachelor Degree in X" is not reported again by the
+     * generic "degree" phrasing. The year depends only on the whole text, so it is
+     * computed once. When nothing matches, one placeholder entry is returned.
+     * NOTE(review): callers cannot tell placeholder values from extracted ones.
+     *
+     * @param text resume text; must not be {@code null}
+     * @return between one and three education entries
+     */
+    public List<Education> extractEducation(String text) {
+        String[] educationPatterns = {
+                "bachelor['\\s]*(?:of|in|degree)?\\s*([^\\n\\r,\\.]+)",
+                "master['\\s]*(?:of|in|degree)?\\s*([^\\n\\r,\\.]+)",
+                "diploma\\s*(?:of|in)?\\s*([^\\n\\r,\\.]+)",
+                "certificate\\s*(?:of|in)?\\s*([^\\n\\r,\\.]+)",
+                "degree\\s*(?:of|in)?\\s*([^\\n\\r,\\.]+)"
+        };
+
+        List<Education> educationList = new ArrayList<>();
+        List<int[]> acceptedSpans = new ArrayList<>();
+        String graduationYear = extractGraduationYear(text);
+
+        for (String pattern : educationPatterns) {
+            Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(text);
+            while (matcher.find()) {
+                // Spans are recorded so later, more generic phrasings do not duplicate an entry.
+                final int start = matcher.start();
+                final int end = matcher.end();
+                String qualification = matcher.group(0).trim();
+                boolean covered = acceptedSpans.stream().anyMatch(s -> s[0] <= start && end <= s[1]);
+                if (covered || qualification.length() >= 100) {
+                    continue;
+                }
+                acceptedSpans.add(new int[] {start, end});
+                educationList.add(new Education()
+                        .setQualification(cleanEducation(qualification))
+                        .setField(cleanEducation(matcher.group(1).trim()))
+                        .setInstitution("University")
+                        .setYear(graduationYear));
+            }
+        }
+
+        // Nothing recognised: fall back to the documented placeholder entry.
+        if (educationList.isEmpty()) {
+            educationList.add(new Education()
+                    .setQualification("Bachelor Degree")
+                    .setField("Construction/Business")
+                    .setInstitution("University")
+                    .setYear("2018"));
+        }
+
+        return educationList.subList(0, Math.min(3, educationList.size()));
+    }
+
+    private String cleanEducation(String text) { // Unicode classes: plain \w is ASCII-only and would drop accented/CJK letters
+        return text.replaceAll("[^\\p{L}\\p{M}\\p{N}_\\s\\-]", "").trim();
+    }
+
+    /**
+     * Picks the year reported for every education entry: the smallest standalone
+     * four-digit number starting with 19 or 20 found anywhere in the text.
+     *
+     * @param text resume text
+     * @return that year as a string, or the default "2018" when the text has none
+     */
+    private String extractGraduationYear(String text) {
+        Matcher yearMatcher = Pattern.compile("\\b(19|20)\\d{2}\\b").matcher(text);
+
+        Integer earliest = null;
+        while (yearMatcher.find()) {
+            // Four digits always fit an int, so no parse failure is possible here.
+            int year = Integer.parseInt(yearMatcher.group());
+            if (earliest == null || year < earliest) {
+                earliest = year;
+            }
+        }
+
+        return earliest == null ? "2018" : earliest.toString();
+    }
+
+    /**
+     * Collects up to five certifications for the candidate.
+     *
+     * <p>The text is searched, case-insensitively and on word boundaries, for a fixed
+     * set of certificate keywords (PMP, White Card, First Aid, CPR, Working at Heights,
+     * OHS/WHS, PRINCE2, Agile, Scrum and others). Whole-word matching keeps short
+     * keywords from firing inside unrelated words, e.g. "agile" inside "fragile".
+     * Each hit is converted to its display name by {@code normalizeCertification}.
+     *
+     * <p>Afterwards three role-specific default certificates are appended unless already
+     * present: the project-manager set for the exact role "Project Manager", the
+     * contracts set for every other role value.
+     *
+     * <p>NOTE(review): the defaults are added even when the resume never mentions
+     * them - confirm this is intended.
+     *
+     * @param text resume text; must not be {@code null}
+     * @param role the position applied for
+     * @return at most five certificate names, detected ones first
+     */
+    public List<String> extractCertifications(String text, String role) {
+        // Lower-case keywords; several of them share one display name.
+        String[] certificationKeywords = {
+                "pmp", "project management professional",
+                "white card", "construction induction",
+                "first aid", "cpr",
+                "working at heights", "height safety",
+                "ohs", "whs", "occupational health",
+                "construction management certificate",
+                "contract management certificate",
+                "legal studies", "law degree",
+                "prince2", "agile", "scrum"
+        };
+
+        Set<String> certifications = new LinkedHashSet<>();
+        for (String keyword : certificationKeywords) {
+            // Pattern.quote keeps the keyword literal; \b anchors it to whole words.
+            Pattern p = Pattern.compile("\\b" + Pattern.quote(keyword) + "\\b",
+                    Pattern.CASE_INSENSITIVE);
+            Matcher matcher = p.matcher(text);
+            if (matcher.find()) {
+                String cert = normalizeCertification(matcher.group());
+                if (cert != null) {
+                    certifications.add(cert);
+                }
+            }
+        }
+
+        // Role-based defaults.
+        List<String> defaultCerts;
+        if ("Project Manager".equals(role)) {
+            defaultCerts = Arrays.asList("PMP", "Construction Management Certificate", "White Card");
+        } else {
+            defaultCerts = Arrays.asList("Contract Management Certificate", "Legal Studies", "White Card");
+        }
+
+        // The set silently ignores names it already holds.
+        certifications.addAll(defaultCerts);
+
+        // Cap the result at five entries.
+        List<String> result = new ArrayList<>(certifications);
+        return result.subList(0, Math.min(5, result.size()));
+    }
+
+    /** Maps a matched certificate keyword to its display name; unknown keywords are returned unchanged. */
+    private String normalizeCertification(String cert) {
+        String[][] displayNames = {
+                {"pmp", "PMP"}, {"project management professional", "PMP"},
+                {"white card", "White Card"}, {"construction induction", "White Card"},
+                {"first aid", "First Aid"}, {"cpr", "CPR"},
+                {"working at heights", "Working at Heights"}, {"height safety", "Working at Heights"},
+                {"ohs", "OHS Certificate"}, {"whs", "WHS Certificate"}, {"occupational health", "OHS Certificate"},
+                {"construction management certificate", "Construction Management Certificate"},
+                {"contract management certificate", "Contract Management Certificate"},
+                {"legal studies", "Legal Studies"}, {"law degree", "Law Degree"},
+                {"prince2", "PRINCE2"}, {"agile", "Agile Certification"}, {"scrum", "Scrum Master"}
+        };
+        // Locale.ROOT keeps the lookup independent of the JVM default locale (e.g. Turkish dotless i).
+        String key = cert.toLowerCase(Locale.ROOT);
+        for (String[] entry : displayNames) {
+            if (entry[0].equals(key)) {
+                return entry[1];
+            }
+        }
+        return cert;
+    }
+
+    /**
+     * Builds the work-history section for the candidate.
+     *
+     * <p>Company names are looked up with two case-insensitive patterns: a name ending
+     * in a corporate suffix (Ltd, Pty, Inc, Corp, Company, Construction, Group,
+     * Services) preceded by "at", "with" or "for", and the same shape without the
+     * preposition. Distinct names between 4 and 49 characters long are kept in order
+     * of discovery, and the first three become entries.
+     *
+     * <p>Only the company name comes from the text. Role is the {@code role} argument,
+     * and duration, responsibilities and projects are produced by
+     * {@code generateDuration}, {@code generateResponsibilities} and
+     * {@code generateProjects}. When no company is found, a single entry for
+     * "Construction Company ABC" covering "2020-2024" is returned.
+     *
+     * <p>NOTE(review): because matching is case-insensitive, the leading {@code [A-Z]}
+     * also accepts lower-case letters, and the character class admits whitespace
+     * including line breaks - confirm the patterns capture what is intended.
+     *
+     * @param text resume text; must not be {@code null}
+     * @param role the position applied for
+     * @return one to three work-experience entries
+     */
+    public List<WorkExperience> extractWorkExperience(String text, String role) {
+        String[] companyPatterns = {
+                "(?:at|with|for)\\s+([A-Z][A-Za-z\\s&,.-]+(?:Ltd|Pty|Inc|Corp|Company|Construction|Group|Services))",
+                "([A-Z][A-Za-z\\s&,.-]+(?:Ltd|Pty|Inc|Corp|Company|Construction|Group|Services))"
+        };
+        Set<String> companies = new LinkedHashSet<>();
+        for (String companyPattern : companyPatterns) {
+            Matcher hits = Pattern.compile(companyPattern, Pattern.CASE_INSENSITIVE).matcher(text);
+            while (hits.find()) {
+                String name = hits.group(1).trim();
+                int length = name.length();
+                if (length > 3 && length < 50) {
+                    companies.add(name);
+                }
+            }
+        }
+
+        List<WorkExperience> experienceList = new ArrayList<>();
+        if (companies.isEmpty()) {
+            // Placeholder entry when the text names no recognisable company.
+            experienceList.add(new WorkExperience()
+                    .setCompany("Construction Company ABC")
+                    .setRole(role)
+                    .setDuration("2020-2024")
+                    .setResponsibilities(generateResponsibilities(role))
+                    .setProjects(generateProjects(role)));
+            return experienceList;
+        }
+
+        int position = 0;
+        for (String company : companies) {
+            if (position == 3) {
+                break;
+            }
+            experienceList.add(new WorkExperience()
+                    .setCompany(company)
+                    .setRole(role)
+                    .setDuration(generateDuration(position))
+                    .setResponsibilities(generateResponsibilities(role))
+                    .setProjects(generateProjects(role)));
+            position++;
+        }
+        return experienceList;
+    }
+
+    /** Synthesises a "start-end" year range for the entry at {@code index}; it is not read from the resume. */
+    private String generateDuration(int index) {
+        int thisYear = Year.now().getValue();
+        // Start moves back 3 years per index, end only 2, so consecutive ranges overlap.
+        return (thisYear - 3 * (index + 1)) + "-" + (thisYear - 2 * index);
+    }
+
+    /**
+     * Returns four canned responsibility sentences; every role except "Contracts Administrator" gets the project-manager set.
+     */
+    private List<String> generateResponsibilities(String role) {
+        if ("Contracts Administrator".equals(role)) {
+            return Arrays.asList(
+                    "Managed contract negotiations and administration",
+                    "Reviewed and analyzed contract terms and conditions",
+                    "Ensured compliance with legal and regulatory requirements",
+                    "Coordinated with legal teams and external parties"
+            );
+        }
+        return Arrays.asList(
+                "Managed construction projects from inception to completion",
+                "Coordinated with multiple stakeholders and contractors",
+                "Ensured projects delivered on time and within budget",
+                "Implemented safety protocols and quality control measures"
+        );
+    }
+
+    /**
+     * Returns three canned project descriptions; every role except "Contracts Administrator" gets the project-manager set.
+     */
+    private List<String> generateProjects(String role) {
+        if ("Contracts Administrator".equals(role)) {
+            return Arrays.asList(
+                    "Multi-million dollar contract management",
+                    "Vendor agreement negotiations",
+                    "Compliance framework implementation"
+            );
+        }
+        return Arrays.asList(
+                "Commercial building construction",
+                "Infrastructure development",
+                "Residential complex projects"
+        );
+    }
+
+    /**
+     * Entry point: turns raw resume text into a populated {@code ResumeData}.
+     *
+     * <p>Delegates to the individual extractors of this class: name, years of
+     * experience and certifications go into the personal-info part; work experience,
+     * skills and education are set directly on the result.
+     *
+     * @param text raw resume text, e.g. as parsed from a PDF or DOCX file;
+     *             must not be {@code null}
+     * @param role the position applied for; passed to the certification,
+     *             work-experience and skill extractors
+     * @return the assembled resume data
+     */
+    public ResumeData extractResumeData(String text, String role) {
+        // Personal details: name, experience and certifications.
+        PersonalInfo candidate = new PersonalInfo()
+                .setName(extractName(text))
+                .setExperienceYears(extractExperienceYears(text))
+                .setCertifications(extractCertifications(text, role));
+
+        // Remaining sections are attached directly to the result; the setters are
+        // chainable because the value classes use Lombok chain accessors.
+        ResumeData resumeData = new ResumeData()
+                .setPersonalInfo(candidate)
+                .setWorkExperience(extractWorkExperience(text, role))
+                .setSkills(extractSkills(text, role))
+                .setEducation(extractEducation(text));
+
+        return resumeData;
+    }
+
+
+}
--- a/vetti-common/src/main/java/com/vetti/common/utils/readText/vo/Education.java
+++ b/vetti-common/src/main/java/com/vetti/common/utils/readText/vo/Education.java
@@ -0,0 +1,21 @@
+package com.vetti.common.utils.readText.vo;
+
+import lombok.Data;
+import lombok.experimental.Accessors;
+
+/**
+ * One education entry of a parsed resume; Lombok generates chainable accessors.
+ *
+ * @author wangxiangshun
+ * @date 2025-11-04
+ */
+@Data
+@Accessors(chain = true)
+public class Education {
+    // All four values are plain strings; nothing in this class validates them.
+    private String qualification;
+    private String field;
+    private String institution;
+    private String year;
+
+}
--- a/vetti-common/src/main/java/com/vetti/common/utils/readText/vo/PersonalInfo.java
+++ b/vetti-common/src/main/java/com/vetti/common/utils/readText/vo/PersonalInfo.java
@@ -0,0 +1,25 @@
+package com.vetti.common.utils.readText.vo;
+
+import lombok.Data;
+import lombok.experimental.Accessors;
+
+import java.util.List;
+
+/**
+ * Basic personal information of a resume; Lombok generates chainable accessors.
+ *
+ * @author wangxiangshun
+ * @date 2025-11-04
+ */
+@Data
+@Accessors(chain = true)
+public class PersonalInfo {
+    /** Candidate name. */
+    private String name;
+    /** Years of work experience (primitive, so it defaults to 0 when never set). */
+    private int experienceYears;
+    /** Certification names. */
+    private List<String> certifications;
+
+
+}
--- a/vetti-common/src/main/java/com/vetti/common/utils/readText/vo/ResumeData.java
+++ b/vetti-common/src/main/java/com/vetti/common/utils/readText/vo/ResumeData.java
@@ -0,0 +1,25 @@
+package com.vetti.common.utils.readText.vo;
+
+import lombok.Data;
+import lombok.experimental.Accessors;
+
+import java.util.List;
+
+/**
+ * Resume data: the root object grouping the four sections of a parsed resume.
+ *
+ * @author wangxiangshun
+ * @date 2025-11-04
+ */
+@Data
+@Accessors(chain = true)
+public class ResumeData {
+    /** Personal information section. */
+    private PersonalInfo personalInfo;
+    /** Work-experience entries. */
+    private List<WorkExperience> workExperience;
+    /** Skill names. */
+    private List<String> skills;
+    /** Education entries. */
+    private List<Education> education;
+}
--- a/vetti-common/src/main/java/com/vetti/common/utils/readText/vo/WorkExperience.java
+++ b/vetti-common/src/main/java/com/vetti/common/utils/readText/vo/WorkExperience.java
@@ -0,0 +1,24 @@
+package com.vetti.common.utils.readText.vo;
+
+import lombok.Data;
+import lombok.experimental.Accessors;
+
+import java.util.List;
+
+/**
+ * One work-experience entry of a parsed resume; Lombok generates chainable accessors.
+ *
+ * @author wangxiangshun
+ * @date 2025-11-04
+ */
+@Data
+@Accessors(chain = true)
+public class WorkExperience {
+    // Plain data holder: three strings and two string lists, none validated here.
+    private String company;
+    private String role;
+    private String duration;
+    private List<String> responsibilities;
+    private List<String> projects;
+
+}