From 1bce9c4fa3adaa1bcace0158805eb1e5c350e8b8 Mon Sep 17 00:00:00 2001 From: wangxiangshun Date: Sat, 18 Oct 2025 23:02:42 +0800 Subject: [PATCH] =?UTF-8?q?STT=E6=B5=81=E5=BC=8F=E8=BE=93=E5=85=A5?= =?UTF-8?q?=E4=B8=9A=E5=8A=A1=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 6 + .../vetti/socket/ChatWebSocketHandler.java | 147 ++++++++++++++-- vetti-common/pom.xml | 9 + .../ai/whisper/OpenAIRealtimeClient.java | 165 ++++++++++++++++++ .../RealtimeTranscriptionMicrophone.java | 115 ++++++++++++ 5 files changed, 429 insertions(+), 13 deletions(-) create mode 100644 vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java create mode 100644 vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java diff --git a/pom.xml b/pom.xml index 7f87c0a..db1dac9 100644 --- a/pom.xml +++ b/pom.xml @@ -304,6 +304,12 @@ 4.5.14 + + org.java-websocket + Java-WebSocket + 1.5.4 + + diff --git a/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java b/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java index 29c7946..5c135b7 100644 --- a/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java +++ b/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java @@ -1,6 +1,7 @@ package com.vetti.socket; import cn.hutool.core.util.StrUtil; +import cn.hutool.json.JSONObject; import cn.hutool.json.JSONUtil; import com.vetti.common.ai.elevenLabs.ElevenLabsClient; import com.vetti.common.ai.gpt.OpenAiStreamClient; @@ -9,9 +10,15 @@ import com.vetti.common.ai.whisper.WhisperClient; import com.vetti.common.config.RuoYiConfig; import com.vetti.common.utils.spring.SpringUtils; import lombok.extern.slf4j.Slf4j; +import okhttp3.*; import org.apache.commons.io.FileUtils; +import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Component; +import 
javax.sound.sampled.AudioFormat; +import javax.sound.sampled.AudioSystem; +import javax.sound.sampled.DataLine; +import javax.sound.sampled.TargetDataLine; import javax.websocket.*; import javax.websocket.server.PathParam; import javax.websocket.server.ServerEndpoint; @@ -19,9 +26,11 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.Base64; import java.util.HashMap; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CountDownLatch; /** * 语音面试 web处理器 @@ -31,11 +40,41 @@ import java.util.concurrent.ConcurrentHashMap; @Component public class ChatWebSocketHandler { + @Value("${whisper.apiUrl}") + private String API_URL; + + @Value("${whisper.model}") + private String MODEL; + + @Value("${whisper.apiKey}") + private String apiKey; + + @Value("${whisper.language}") + private String language; + + /** + * 16kHz + */ + private static final int SAMPLE_RATE = 16000; + /** + * 4 KB 每次读取 + */ + private static final int BUFFER_SIZE = 4096; + /** + * 每样本 16 位 + */ + private static final int BITS_PER_SAMPLE = 16; + /** * 缓存客户端流式解析的语音文本数据 */ private final Map cacheClientTts = new ConcurrentHashMap<>(); + /** + * 缓存客户端调用OpenAi中的websocket-STT 流式传输数据 + */ + private final Map cacheWebSocket = new ConcurrentHashMap<>(); + // 语音文件保存目录 private static final String VOICE_STORAGE_DIR = "/voice_files/"; @@ -61,6 +100,8 @@ public class ChatWebSocketHandler { public void onOpen(Session session, @PathParam("clientId") String clientId) { log.info("WebSocket 链接已建立:{}", clientId); cacheClientTts.put(clientId,new String()); + //初始化STT流式语音转换文本的socket链接 + createWhisperRealtimeSocket(clientId); } // 接收文本消息 @@ -148,20 +189,18 @@ public class ChatWebSocketHandler { log.info("3、开始接收数据流时间:{}",System.currentTimeMillis()/1000); try{ //接收到数据流后直接就进行SST处理 - //拿到文件进行文字转换 - saveAsWebM(bytes,pathUrl); - WhisperClient whisperClient = SpringUtils.getBean(WhisperClient.class); - String 
resultText = whisperClient.handleVoiceToText(pathUrl); - log.info("STT:{}",resultText); - //进行客户端文本数据存储 - String cacheString = cacheClientTts.get(clientId); - if(StrUtil.isNotEmpty(cacheString)){ - cacheString = cacheString+resultText; - }else { - cacheString = resultText; + //发送消息 + WebSocket webSocket = cacheWebSocket.get(clientId); + if(webSocket != null){ + log.info("3.1 开始发送数据音频流啦"); + // 将音频数据转换为 Base64 编码的字符串 + String base64Audio = Base64.getEncoder().encodeToString(bytes); + String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }"; + webSocket.send(message); + // 3. 提交音频并请求转录 + webSocket.send("{\"type\": \"input_audio_buffer.commit\"}"); + webSocket.send("{\"type\": \"response.create\"}"); } - cacheClientTts.put(clientId,cacheString); - }catch (Exception e){ e.printStackTrace(); } @@ -243,5 +282,87 @@ public class ChatWebSocketHandler { return null; } + /** + * 创建STT WebSocket 客户端链接 + * @param clientId 客户端ID + */ + private void createWhisperRealtimeSocket(String clientId){ + try{ + OkHttpClient client = new OkHttpClient(); + CountDownLatch latch = new CountDownLatch(1); + // 设置 WebSocket 请求 + Request request = new Request.Builder() + .url(API_URL) + .addHeader("Authorization", "Bearer " + apiKey) + .addHeader("OpenAI-Beta", "realtime=v1") + .build(); + client.newWebSocket(request, new WebSocketListener() { + @Override + public void onOpen(WebSocket webSocket, Response response) { + System.out.println("✅ WebSocket 连接成功"); + //发送配置 + JSONObject config = new JSONObject(); + JSONObject sessionConfig = new JSONObject(); + JSONObject transcription = new JSONObject(); + JSONObject turnDetection = new JSONObject(); + // 配置转录参数 + transcription.put("model", "gpt-4o-mini-transcribe"); + transcription.put("language", language); // 中文 + // 配置断句检测 + turnDetection.put("type", "server_vad"); + turnDetection.put("prefix_padding_ms", 300); + turnDetection.put("silence_duration_ms", 10); + // 组装完整配置 + 
sessionConfig.put("input_audio_transcription", transcription); + sessionConfig.put("turn_detection", turnDetection); + config.put("type", "transcription_session.update"); + config.put("session", sessionConfig); + webSocket.send(config.toString()); + // 1. 启动音频缓冲 + webSocket.send("{\"type\": \"input_audio_buffer.start\"}"); + //存储客户端webSocket对象,对数据进行隔离处理 + cacheWebSocket.put(clientId,webSocket); + } + + @Override + public void onMessage(WebSocket webSocket, String text) { + System.out.println("📩 收到转录结果: " + text); + //对数据进行解析 + if(StrUtil.isNotEmpty(text)){ + Map mapResultData = JSONUtil.toBean(text,Map.class); + if("conversation.item.input_audio_transcription.delta".equals(mapResultData.get("type"))){ + String resultText = mapResultData.get("delta"); + //进行客户端文本数据存储 + String cacheString = cacheClientTts.get(clientId); + if(StrUtil.isNotEmpty(cacheString)){ + cacheString = cacheString+resultText; + }else { + cacheString = resultText; + } + cacheClientTts.put(clientId,cacheString); + } + } + } + + @Override + public void onFailure(WebSocket webSocket, Throwable t, Response response) { + System.err.println("❌ 连接失败: " + t.getMessage()); + latch.countDown(); + } + + @Override + public void onClosing(WebSocket webSocket, int code, String reason) { + System.out.println("⚠️ 连接即将关闭: " + reason); + webSocket.close(1000, null); + latch.countDown(); + } + }); + // 等待 WebSocket 关闭 + latch.await(); + }catch (Exception e){ + e.printStackTrace(); + } + } + } diff --git a/vetti-common/pom.xml b/vetti-common/pom.xml index d9c1369..9da9e72 100644 --- a/vetti-common/pom.xml +++ b/vetti-common/pom.xml @@ -165,6 +165,15 @@ httpmime + + org.java-websocket + Java-WebSocket + + + org.apache.tomcat.embed + tomcat-embed-websocket + + diff --git a/vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java b/vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java new file mode 100644 index 0000000..4732cf3 --- /dev/null +++ 
b/vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java @@ -0,0 +1,165 @@ +package com.vetti.common.ai.whisper; + +import cn.hutool.json.JSONObject; +import lombok.extern.slf4j.Slf4j; +import org.java_websocket.client.WebSocketClient; +import org.java_websocket.handshake.ServerHandshake; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.ByteBuffer; +import java.util.Base64; +import java.util.HashMap; +import java.util.Map; + +@Slf4j +public class OpenAIRealtimeClient extends WebSocketClient { + + // 构造方法:初始化连接地址和请求头(携带认证信息) + public OpenAIRealtimeClient(String apiKey) throws URISyntaxException { + super( + new URI("wss://api.openai.com/v1/realtime?intent=transcription"), + buildHeaders(apiKey) // 构建请求头 + ); + } + + // 构建请求头(携带认证和内容类型) + private static Map buildHeaders(String apiKey) { + Map headers = new HashMap<>(); + headers.put("Authorization", "Bearer " + apiKey); // 核心认证头 + headers.put("Content-Type", "application/json"); // 根据接口要求调整 + headers.put("OpenAI-Beta", "realtime=v1"); // 若接口要求 beta 版本标识 + return headers; + } + + // 连接成功回调 + @Override + public void onOpen(ServerHandshake handshakedata) { + System.out.println("WebSocket 连接已打开,状态码:" + handshakedata.getHttpStatus()); + // 连接成功后可发送初始化消息(如配置转录参数) + sendInitMessage(); + } + + // 接收服务器消息回调(处理转录结果) + @Override + public void onMessage(String message) { + System.out.println("收到转录文本:" + message); + // 解析 JSON 格式的转录结果(可使用 Jackson/Gson 等库) + } + + // 接收二进制消息(若服务器返回二进制数据,如音频片段确认) + @Override + public void onMessage(ByteBuffer bytes) { + System.out.println("收到二进制数据,长度:" + bytes.remaining()); + // 处理二进制消息(如需) + } + + // 连接关闭回调 + @Override + public void onClose(int code, String reason, boolean remote) { + System.out.println("连接关闭,状态码:" + code + ",原因:" + reason); + } + + // 连接错误回调 + @Override + public void onError(Exception ex) { + System.err.println("连接错误:" + ex.getMessage()); + ex.printStackTrace(); + } + + // 发送初始化消息(根据 OpenAI 接口要求配置转录参数) + private void 
sendInitMessage() { + JSONObject config = new JSONObject(); + JSONObject sessionConfig = new JSONObject(); + JSONObject transcription = new JSONObject(); + JSONObject turnDetection = new JSONObject(); + + // 配置转录参数 + transcription.put("model", "gpt-4o-mini-transcribe"); + transcription.put("language", "zh"); // 中文 + + // 配置断句检测 + turnDetection.put("type", "server_vad"); + turnDetection.put("prefix_padding_ms", 300); + turnDetection.put("silence_duration_ms", 10); + + // 组装完整配置 + sessionConfig.put("input_audio_transcription", transcription); + sessionConfig.put("turn_detection", turnDetection); + config.put("type", "transcription_session.update"); + config.put("session", sessionConfig); + + this.send(config.toString()); + System.out.println("已发送初始化配置"); + } + + // 发送音频数据(核心:将麦克风/文件的音频流发送到服务器) + public void sendAudioData(byte[] audioBytes) { + if (this.isOpen()) { + // 按接口要求封装音频数据(通常为 JSON 包裹二进制,或直接发送二进制) + // OpenAI要求语音数据以Base64编码发送 + String base64Chunk = Base64.getEncoder().encodeToString(audioBytes); + String audioJson = "{\n" + + " \"type\": \"input_audio_buffer.append\",\n" + + " \"audio\": \""+base64Chunk+"\"\n" + + "}"; + this.send(audioJson); +// this.send(audioBytes); + System.out.println("已发送音频数据,长度:" + audioBytes.length); + } else { + System.err.println("连接未打开,无法发送音频"); + } + + } + + public void commitData() { + String base64Chunk = Base64.getEncoder().encodeToString(new byte[0]); + String audioJson = "{\n" + + " \"type\": \"input_audio_buffer.append\",\n" + + " \"audio\": \""+base64Chunk+"\"\n" + + "}"; + this.send(audioJson); + } + + public static void main(String[] args) { + String apiKey = System.getenv("OPENAI_API_KEY"); // SECURITY: leaked sk-proj-... key redacted — revoke it and never commit secrets; 替换为你的 OpenAI API Key + + try { + // 创建客户端 + OpenAIRealtimeClient client = new OpenAIRealtimeClient(apiKey); + // 连接服务器 + client.connectBlocking(); // 阻塞式连接(也可使用非阻塞 connect()) + + // 
模拟发送音频数据(实际应从麦克风或文件读取) + // 注意:音频格式需符合接口要求(通常为 PCM 16kHz 单声道等) + // 读取本地PCM文件(16kHz单声道16位)并分片发送 + try (java.io.FileInputStream fis = new java.io.FileInputStream("/Users/wangxiangshun/Desktop/临时文件/output1112.mp3")) { + byte[] buffer = new byte[6400]; // 200ms的PCM数据(16000Hz*16位*1声道=32000字节/秒 → 6400字节/200ms) + int len; + while ((len = fis.read(buffer)) != -1) { + byte[] chunk = new byte[len]; + System.arraycopy(buffer, 0, chunk, 0, len); + client.sendAudioData(chunk); + Thread.sleep(200); // 模拟实时流(每200ms发送一次) + } + } + + //发送一个空的二进制流 + // 4. 发送结束标记(空二进制消息) + client.sendAudioData(new byte[600]); + + // 等待转录完成(实际场景需根据业务逻辑控制) + Thread.sleep(20000); + + // 关闭连接 + client.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + // 模拟读取音频数据(实际需用音频库采集,如 Java Sound API 或 VLCJ) + private static byte[] readAudioFromSource() { + // 示例:返回空字节数组(实际应填充真实音频数据) + return new byte[1024]; + } +} diff --git a/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java b/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java new file mode 100644 index 0000000..619b96d --- /dev/null +++ b/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java @@ -0,0 +1,115 @@ +package com.vetti.common.ai.whisper; + +import cn.hutool.json.JSONObject; +import okhttp3.*; + +import javax.sound.sampled.AudioFormat; +import javax.sound.sampled.AudioSystem; +import javax.sound.sampled.DataLine; +import javax.sound.sampled.TargetDataLine; +import java.util.Base64; +import java.util.concurrent.CountDownLatch; + +public class RealtimeTranscriptionMicrophone { + + private static final String API_KEY = System.getenv("OPENAI_API_KEY"); // SECURITY: leaked sk-proj-... key redacted — revoke it and never commit secrets + private static final String URL = "wss://api.openai.com/v1/realtime?intent=transcription"; + private static final int SAMPLE_RATE = 
16000; // 16kHz + private static final int BUFFER_SIZE = 4096; // 4 KB 每次读取 + private static final int BITS_PER_SAMPLE = 16; // 每样本 16 位 + + public static void main(String[] args) throws Exception { + + OkHttpClient client = new OkHttpClient(); + CountDownLatch latch = new CountDownLatch(1); + + // 设置 WebSocket 请求 + Request request = new Request.Builder() + .url(URL) + .addHeader("Authorization", "Bearer " + API_KEY) + .addHeader("OpenAI-Beta", "realtime=v1") + .build(); + + WebSocket ws = client.newWebSocket(request, new WebSocketListener() { + @Override + public void onOpen(WebSocket webSocket, Response response) { + System.out.println("✅ WebSocket 连接成功"); + + //发送配置 + JSONObject config = new JSONObject(); + JSONObject sessionConfig = new JSONObject(); + JSONObject transcription = new JSONObject(); + JSONObject turnDetection = new JSONObject(); + + // 配置转录参数 + transcription.put("model", "gpt-4o-mini-transcribe"); + transcription.put("language", "zh"); // 中文 + // 配置断句检测 + turnDetection.put("type", "server_vad"); + turnDetection.put("prefix_padding_ms", 300); + turnDetection.put("silence_duration_ms", 10); + // 组装完整配置 + sessionConfig.put("input_audio_transcription", transcription); + sessionConfig.put("turn_detection", turnDetection); + config.put("type", "transcription_session.update"); + config.put("session", sessionConfig); + + webSocket.send(config.toString()); + + // 1. 启动音频缓冲 + webSocket.send("{\"type\": \"input_audio_buffer.start\"}"); + + // 2. 
开始录音并实时发送 + new Thread(() -> { + try { + // 设置麦克风输入流 + AudioFormat format = new AudioFormat(SAMPLE_RATE, BITS_PER_SAMPLE, 1, true, false); + DataLine.Info info = new DataLine.Info(TargetDataLine.class, format); + TargetDataLine line = (TargetDataLine) AudioSystem.getLine(info); + line.open(format); + line.start(); + + byte[] buffer = new byte[BUFFER_SIZE]; + int bytesRead; + while ((bytesRead = line.read(buffer, 0, buffer.length)) > 0) { + // 将音频数据转换为 Base64 编码的字符串 + byte[] audioData = new byte[bytesRead]; + System.arraycopy(buffer, 0, audioData, 0, bytesRead); + String base64Audio = Base64.getEncoder().encodeToString(audioData); + String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }"; + webSocket.send(message); + } + + // 3. 提交音频并请求转录 + webSocket.send("{\"type\": \"input_audio_buffer.commit\"}"); + webSocket.send("{\"type\": \"response.create\"}"); + } catch (Exception e) { + e.printStackTrace(); + } + }).start(); + } + + @Override + public void onMessage(WebSocket webSocket, String text) { + System.out.println("📩 收到转录结果: " + text); + } + + @Override + public void onFailure(WebSocket webSocket, Throwable t, Response response) { + System.err.println("❌ 连接失败: " + t.getMessage()); + latch.countDown(); + } + + @Override + public void onClosing(WebSocket webSocket, int code, String reason) { + System.out.println("⚠️ 连接即将关闭: " + reason); + webSocket.close(1000, null); + latch.countDown(); + } + }); + + // 等待 WebSocket 关闭 + latch.await(); + } +} +