diff --git a/pom.xml b/pom.xml
index 7f87c0a..db1dac9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -304,6 +304,12 @@
4.5.14
+
+ org.java-websocket
+ Java-WebSocket
+ 1.5.4
+
+
diff --git a/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java b/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java
index 29c7946..5c135b7 100644
--- a/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java
+++ b/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java
@@ -1,6 +1,7 @@
package com.vetti.socket;
import cn.hutool.core.util.StrUtil;
+import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
import com.vetti.common.ai.elevenLabs.ElevenLabsClient;
import com.vetti.common.ai.gpt.OpenAiStreamClient;
@@ -9,9 +10,15 @@ import com.vetti.common.ai.whisper.WhisperClient;
import com.vetti.common.config.RuoYiConfig;
import com.vetti.common.utils.spring.SpringUtils;
import lombok.extern.slf4j.Slf4j;
+import okhttp3.*;
import org.apache.commons.io.FileUtils;
+import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioSystem;
+import javax.sound.sampled.DataLine;
+import javax.sound.sampled.TargetDataLine;
import javax.websocket.*;
import javax.websocket.server.PathParam;
import javax.websocket.server.ServerEndpoint;
@@ -19,9 +26,11 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.util.Base64;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CountDownLatch;
/**
* 语音面试 web处理器
@@ -31,11 +40,41 @@ import java.util.concurrent.ConcurrentHashMap;
@Component
public class ChatWebSocketHandler {
+ @Value("${whisper.apiUrl}")
+ private String API_URL;
+
+ @Value("${whisper.model}")
+ private String MODEL;
+
+ @Value("${whisper.apiKey}")
+ private String apiKey;
+
+ @Value("${whisper.language}")
+ private String language;
+
+ /**
+ * 16kHz
+ */
+ private static final int SAMPLE_RATE = 16000;
+ /**
+ * 4 KB 每次读取
+ */
+ private static final int BUFFER_SIZE = 4096;
+ /**
+ * 每样本 16 位
+ */
+ private static final int BITS_PER_SAMPLE = 16;
+
/**
* 缓存客户端流式解析的语音文本数据
*/
private final Map cacheClientTts = new ConcurrentHashMap<>();
+ /**
+ * 缓存客户端调用OpenAi中的websocket-STT 流式传输数据
+ */
+ private final Map cacheWebSocket = new ConcurrentHashMap<>();
+
// 语音文件保存目录
private static final String VOICE_STORAGE_DIR = "/voice_files/";
@@ -61,6 +100,8 @@ public class ChatWebSocketHandler {
public void onOpen(Session session, @PathParam("clientId") String clientId) {
log.info("WebSocket 链接已建立:{}", clientId);
cacheClientTts.put(clientId,new String());
+ //初始化STT流式语音转换文本的socket链接
+ createWhisperRealtimeSocket(clientId);
}
// 接收文本消息
@@ -148,20 +189,18 @@ public class ChatWebSocketHandler {
log.info("3、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
try{
//接收到数据流后直接就进行SST处理
- //拿到文件进行文字转换
- saveAsWebM(bytes,pathUrl);
- WhisperClient whisperClient = SpringUtils.getBean(WhisperClient.class);
- String resultText = whisperClient.handleVoiceToText(pathUrl);
- log.info("STT:{}",resultText);
- //进行客户端文本数据存储
- String cacheString = cacheClientTts.get(clientId);
- if(StrUtil.isNotEmpty(cacheString)){
- cacheString = cacheString+resultText;
- }else {
- cacheString = resultText;
+ //发送消息
+ WebSocket webSocket = cacheWebSocket.get(clientId);
+ if(webSocket != null){
+ log.info("3.1 开始发送数据音频流啦");
+ // 将音频数据转换为 Base64 编码的字符串
+ String base64Audio = Base64.getEncoder().encodeToString(bytes);
+ String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }";
+ webSocket.send(message);
+ // 3. 提交音频并请求转录
+ webSocket.send("{\"type\": \"input_audio_buffer.commit\"}");
+ webSocket.send("{\"type\": \"response.create\"}");
}
- cacheClientTts.put(clientId,cacheString);
-
}catch (Exception e){
e.printStackTrace();
}
@@ -243,5 +282,87 @@ public class ChatWebSocketHandler {
return null;
}
+ /**
+ * 创建STT WebSocket 客户端链接
+ * @param clientId 客户端ID
+ */
+ private void createWhisperRealtimeSocket(String clientId){
+ try{
+ OkHttpClient client = new OkHttpClient();
+ CountDownLatch latch = new CountDownLatch(1);
+ // 设置 WebSocket 请求
+ Request request = new Request.Builder()
+ .url(API_URL)
+ .addHeader("Authorization", "Bearer " + apiKey)
+ .addHeader("OpenAI-Beta", "realtime=v1")
+ .build();
+ client.newWebSocket(request, new WebSocketListener() {
+ @Override
+ public void onOpen(WebSocket webSocket, Response response) {
+ System.out.println("✅ WebSocket 连接成功");
+ //发送配置
+ JSONObject config = new JSONObject();
+ JSONObject sessionConfig = new JSONObject();
+ JSONObject transcription = new JSONObject();
+ JSONObject turnDetection = new JSONObject();
+ // 配置转录参数
+ transcription.put("model", "gpt-4o-mini-transcribe");
+ transcription.put("language", language); // 中文
+ // 配置断句检测
+ turnDetection.put("type", "server_vad");
+ turnDetection.put("prefix_padding_ms", 300);
+ turnDetection.put("silence_duration_ms", 10);
+ // 组装完整配置
+ sessionConfig.put("input_audio_transcription", transcription);
+ sessionConfig.put("turn_detection", turnDetection);
+ config.put("type", "transcription_session.update");
+ config.put("session", sessionConfig);
+ webSocket.send(config.toString());
+ // 1. 启动音频缓冲
+ webSocket.send("{\"type\": \"input_audio_buffer.start\"}");
+ //存储客户端webSocket对象,对数据进行隔离处理
+ cacheWebSocket.put(clientId,webSocket);
+ }
+
+ @Override
+ public void onMessage(WebSocket webSocket, String text) {
+ System.out.println("📩 收到转录结果: " + text);
+ //对数据进行解析
+ if(StrUtil.isNotEmpty(text)){
+ Map mapResultData = JSONUtil.toBean(text,Map.class);
+ if("conversation.item.input_audio_transcription.delta".equals(mapResultData.get("type"))){
+ String resultText = mapResultData.get("delta");
+ //进行客户端文本数据存储
+ String cacheString = cacheClientTts.get(clientId);
+ if(StrUtil.isNotEmpty(cacheString)){
+ cacheString = cacheString+resultText;
+ }else {
+ cacheString = resultText;
+ }
+ cacheClientTts.put(clientId,cacheString);
+ }
+ }
+ }
+
+ @Override
+ public void onFailure(WebSocket webSocket, Throwable t, Response response) {
+ System.err.println("❌ 连接失败: " + t.getMessage());
+ latch.countDown();
+ }
+
+ @Override
+ public void onClosing(WebSocket webSocket, int code, String reason) {
+ System.out.println("⚠️ 连接即将关闭: " + reason);
+ webSocket.close(1000, null);
+ latch.countDown();
+ }
+ });
+ // 等待 WebSocket 关闭
+ latch.await();
+ }catch (Exception e){
+ e.printStackTrace();
+ }
+ }
+
}
diff --git a/vetti-common/pom.xml b/vetti-common/pom.xml
index d9c1369..9da9e72 100644
--- a/vetti-common/pom.xml
+++ b/vetti-common/pom.xml
@@ -165,6 +165,15 @@
httpmime
+
+ org.java-websocket
+ Java-WebSocket
+
+
+ org.apache.tomcat.embed
+ tomcat-embed-websocket
+
+
diff --git a/vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java b/vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java
new file mode 100644
index 0000000..4732cf3
--- /dev/null
+++ b/vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java
@@ -0,0 +1,165 @@
+package com.vetti.common.ai.whisper;
+
+import cn.hutool.json.JSONObject;
+import lombok.extern.slf4j.Slf4j;
+import org.java_websocket.client.WebSocketClient;
+import org.java_websocket.handshake.ServerHandshake;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.ByteBuffer;
+import java.util.Base64;
+import java.util.HashMap;
+import java.util.Map;
+
+@Slf4j
+public class OpenAIRealtimeClient extends WebSocketClient {
+
+ // 构造方法:初始化连接地址和请求头(携带认证信息)
+ public OpenAIRealtimeClient(String apiKey) throws URISyntaxException {
+ super(
+ new URI("wss://api.openai.com/v1/realtime?intent=transcription"),
+ buildHeaders(apiKey) // 构建请求头
+ );
+ }
+
+ // 构建请求头(携带认证和内容类型)
+ private static Map buildHeaders(String apiKey) {
+ Map headers = new HashMap<>();
+ headers.put("Authorization", "Bearer " + apiKey); // 核心认证头
+ headers.put("Content-Type", "application/json"); // 根据接口要求调整
+ headers.put("OpenAI-Beta", "realtime=v1"); // 若接口要求 beta 版本标识
+ return headers;
+ }
+
+ // 连接成功回调
+ @Override
+ public void onOpen(ServerHandshake handshakedata) {
+ System.out.println("WebSocket 连接已打开,状态码:" + handshakedata.getHttpStatus());
+ // 连接成功后可发送初始化消息(如配置转录参数)
+ sendInitMessage();
+ }
+
+ // 接收服务器消息回调(处理转录结果)
+ @Override
+ public void onMessage(String message) {
+ System.out.println("收到转录文本:" + message);
+ // 解析 JSON 格式的转录结果(可使用 Jackson/Gson 等库)
+ }
+
+ // 接收二进制消息(若服务器返回二进制数据,如音频片段确认)
+ @Override
+ public void onMessage(ByteBuffer bytes) {
+ System.out.println("收到二进制数据,长度:" + bytes.remaining());
+ // 处理二进制消息(如需)
+ }
+
+ // 连接关闭回调
+ @Override
+ public void onClose(int code, String reason, boolean remote) {
+ System.out.println("连接关闭,状态码:" + code + ",原因:" + reason);
+ }
+
+ // 连接错误回调
+ @Override
+ public void onError(Exception ex) {
+ System.err.println("连接错误:" + ex.getMessage());
+ ex.printStackTrace();
+ }
+
+ // 发送初始化消息(根据 OpenAI 接口要求配置转录参数)
+ private void sendInitMessage() {
+ JSONObject config = new JSONObject();
+ JSONObject sessionConfig = new JSONObject();
+ JSONObject transcription = new JSONObject();
+ JSONObject turnDetection = new JSONObject();
+
+ // 配置转录参数
+ transcription.put("model", "gpt-4o-mini-transcribe");
+ transcription.put("language", "zh"); // 中文
+
+ // 配置断句检测
+ turnDetection.put("type", "server_vad");
+ turnDetection.put("prefix_padding_ms", 300);
+ turnDetection.put("silence_duration_ms", 10);
+
+ // 组装完整配置
+ sessionConfig.put("input_audio_transcription", transcription);
+ sessionConfig.put("turn_detection", turnDetection);
+ config.put("type", "transcription_session.update");
+ config.put("session", sessionConfig);
+
+ this.send(config.toString());
+ System.out.println("已发送初始化配置");
+ }
+
+ // 发送音频数据(核心:将麦克风/文件的音频流发送到服务器)
+ public void sendAudioData(byte[] audioBytes) {
+ if (this.isOpen()) {
+ // 按接口要求封装音频数据(通常为 JSON 包裹二进制,或直接发送二进制)
+ // OpenAI要求语音数据以Base64编码发送
+ String base64Chunk = Base64.getEncoder().encodeToString(audioBytes);
+ String audioJson = "{\n" +
+ " \"type\": \"input_audio_buffer.append\",\n" +
+ " \"audio\": \""+base64Chunk+"\"\n" +
+ "}";
+ this.send(audioJson);
+// this.send(audioBytes);
+ System.out.println("已发送音频数据,长度:" + audioBytes.length);
+ } else {
+ System.err.println("连接未打开,无法发送音频");
+ }
+
+ }
+
+ public void commitData() {
+ String base64Chunk = Base64.getEncoder().encodeToString(new byte[0]);
+ String audioJson = "{\n" +
+ " \"type\": \"input_audio_buffer.append\",\n" +
+ " \"audio\": \""+base64Chunk+"\"\n" +
+ "}";
+ this.send(audioJson);
+ }
+
+ public static void main(String[] args) {
+ String apiKey = "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"; // 替换为你的 OpenAI API Key
+
+ try {
+ // 创建客户端
+ OpenAIRealtimeClient client = new OpenAIRealtimeClient(apiKey);
+ // 连接服务器
+ client.connectBlocking(); // 阻塞式连接(也可使用非阻塞 connect())
+
+ // 模拟发送音频数据(实际应从麦克风或文件读取)
+ // 注意:音频格式需符合接口要求(通常为 PCM 16kHz 单声道等)
+ // 读取本地PCM文件(16kHz单声道16位)并分片发送
+ try (java.io.FileInputStream fis = new java.io.FileInputStream("/Users/wangxiangshun/Desktop/临时文件/output1112.mp3")) {
+ byte[] buffer = new byte[6400]; // 200ms的PCM数据(16000Hz*16位*1声道=32000字节/秒 → 6400字节/200ms)
+ int len;
+ while ((len = fis.read(buffer)) != -1) {
+ byte[] chunk = new byte[len];
+ System.arraycopy(buffer, 0, chunk, 0, len);
+ client.sendAudioData(chunk);
+ Thread.sleep(200); // 模拟实时流(每200ms发送一次)
+ }
+ }
+
+ //发送一个空的二进制流
+ // 4. 发送结束标记(空二进制消息)
+ client.sendAudioData(new byte[600]);
+
+ // 等待转录完成(实际场景需根据业务逻辑控制)
+ Thread.sleep(20000);
+
+ // 关闭连接
+ client.close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ // 模拟读取音频数据(实际需用音频库采集,如 Java Sound API 或 VLCJ)
+ private static byte[] readAudioFromSource() {
+ // 示例:返回空字节数组(实际应填充真实音频数据)
+ return new byte[1024];
+ }
+}
diff --git a/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java b/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java
new file mode 100644
index 0000000..619b96d
--- /dev/null
+++ b/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java
@@ -0,0 +1,115 @@
+package com.vetti.common.ai.whisper;
+
+import cn.hutool.json.JSONObject;
+import okhttp3.*;
+
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioSystem;
+import javax.sound.sampled.DataLine;
+import javax.sound.sampled.TargetDataLine;
+import java.util.Base64;
+import java.util.concurrent.CountDownLatch;
+
+public class RealtimeTranscriptionMicrophone {
+
+ private static final String API_KEY = "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";
+ private static final String URL = "wss://api.openai.com/v1/realtime?intent=transcription";
+ private static final int SAMPLE_RATE = 16000; // 16kHz
+ private static final int BUFFER_SIZE = 4096; // 4 KB 每次读取
+ private static final int BITS_PER_SAMPLE = 16; // 每样本 16 位
+
+ public static void main(String[] args) throws Exception {
+
+ OkHttpClient client = new OkHttpClient();
+ CountDownLatch latch = new CountDownLatch(1);
+
+ // 设置 WebSocket 请求
+ Request request = new Request.Builder()
+ .url(URL)
+ .addHeader("Authorization", "Bearer " + API_KEY)
+ .addHeader("OpenAI-Beta", "realtime=v1")
+ .build();
+
+ WebSocket ws = client.newWebSocket(request, new WebSocketListener() {
+ @Override
+ public void onOpen(WebSocket webSocket, Response response) {
+ System.out.println("✅ WebSocket 连接成功");
+
+ //发送配置
+ JSONObject config = new JSONObject();
+ JSONObject sessionConfig = new JSONObject();
+ JSONObject transcription = new JSONObject();
+ JSONObject turnDetection = new JSONObject();
+
+ // 配置转录参数
+ transcription.put("model", "gpt-4o-mini-transcribe");
+ transcription.put("language", "zh"); // 中文
+ // 配置断句检测
+ turnDetection.put("type", "server_vad");
+ turnDetection.put("prefix_padding_ms", 300);
+ turnDetection.put("silence_duration_ms", 10);
+ // 组装完整配置
+ sessionConfig.put("input_audio_transcription", transcription);
+ sessionConfig.put("turn_detection", turnDetection);
+ config.put("type", "transcription_session.update");
+ config.put("session", sessionConfig);
+
+ webSocket.send(config.toString());
+
+ // 1. 启动音频缓冲
+ webSocket.send("{\"type\": \"input_audio_buffer.start\"}");
+
+ // 2. 开始录音并实时发送
+ new Thread(() -> {
+ try {
+ // 设置麦克风输入流
+ AudioFormat format = new AudioFormat(SAMPLE_RATE, BITS_PER_SAMPLE, 1, true, false);
+ DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
+ TargetDataLine line = (TargetDataLine) AudioSystem.getLine(info);
+ line.open(format);
+ line.start();
+
+ byte[] buffer = new byte[BUFFER_SIZE];
+ int bytesRead;
+ while ((bytesRead = line.read(buffer, 0, buffer.length)) > 0) {
+ // 将音频数据转换为 Base64 编码的字符串
+ byte[] audioData = new byte[bytesRead];
+ System.arraycopy(buffer, 0, audioData, 0, bytesRead);
+ String base64Audio = Base64.getEncoder().encodeToString(audioData);
+ String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }";
+ webSocket.send(message);
+ }
+
+ // 3. 提交音频并请求转录
+ webSocket.send("{\"type\": \"input_audio_buffer.commit\"}");
+ webSocket.send("{\"type\": \"response.create\"}");
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }).start();
+ }
+
+ @Override
+ public void onMessage(WebSocket webSocket, String text) {
+ System.out.println("📩 收到转录结果: " + text);
+ }
+
+ @Override
+ public void onFailure(WebSocket webSocket, Throwable t, Response response) {
+ System.err.println("❌ 连接失败: " + t.getMessage());
+ latch.countDown();
+ }
+
+ @Override
+ public void onClosing(WebSocket webSocket, int code, String reason) {
+ System.out.println("⚠️ 连接即将关闭: " + reason);
+ webSocket.close(1000, null);
+ latch.countDown();
+ }
+ });
+
+ // 等待 WebSocket 关闭
+ latch.await();
+ }
+}
+