From 1bce9c4fa3adaa1bcace0158805eb1e5c350e8b8 Mon Sep 17 00:00:00 2001 From: wangxiangshun Date: Sat, 18 Oct 2025 23:02:42 +0800 Subject: [PATCH] =?UTF-8?q?STT=E6=B5=81=E5=BC=8F=E8=BE=93=E5=85=A5?= =?UTF-8?q?=E4=B8=9A=E5=8A=A1=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 6 + .../vetti/socket/ChatWebSocketHandler.java | 147 ++++++++++++++-- vetti-common/pom.xml | 9 + .../ai/whisper/OpenAIRealtimeClient.java | 165 ++++++++++++++++++ .../RealtimeTranscriptionMicrophone.java | 115 ++++++++++++ 5 files changed, 429 insertions(+), 13 deletions(-) create mode 100644 vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java create mode 100644 vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java diff --git a/pom.xml b/pom.xml index 7f87c0a..db1dac9 100644 --- a/pom.xml +++ b/pom.xml @@ -304,6 +304,12 @@ 4.5.14 + + org.java-websocket + Java-WebSocket + 1.5.4 + + diff --git a/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java b/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java index 29c7946..5c135b7 100644 --- a/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java +++ b/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java @@ -1,6 +1,7 @@ package com.vetti.socket; import cn.hutool.core.util.StrUtil; +import cn.hutool.json.JSONObject; import cn.hutool.json.JSONUtil; import com.vetti.common.ai.elevenLabs.ElevenLabsClient; import com.vetti.common.ai.gpt.OpenAiStreamClient; @@ -9,9 +10,15 @@ import com.vetti.common.ai.whisper.WhisperClient; import com.vetti.common.config.RuoYiConfig; import com.vetti.common.utils.spring.SpringUtils; import lombok.extern.slf4j.Slf4j; +import okhttp3.*; import org.apache.commons.io.FileUtils; +import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Component; +import 
javax.sound.sampled.AudioFormat; +import javax.sound.sampled.AudioSystem; +import javax.sound.sampled.DataLine; +import javax.sound.sampled.TargetDataLine; import javax.websocket.*; import javax.websocket.server.PathParam; import javax.websocket.server.ServerEndpoint; @@ -19,9 +26,11 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.Base64; import java.util.HashMap; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CountDownLatch; /** * 语音面试 web处理器 @@ -31,11 +40,41 @@ import java.util.concurrent.ConcurrentHashMap; @Component public class ChatWebSocketHandler { + @Value("${whisper.apiUrl}") + private String API_URL; + + @Value("${whisper.model}") + private String MODEL; + + @Value("${whisper.apiKey}") + private String apiKey; + + @Value("${whisper.language}") + private String language; + + /** + * 16kHz + */ + private static final int SAMPLE_RATE = 16000; + /** + * 4 KB 每次读取 + */ + private static final int BUFFER_SIZE = 4096; + /** + * 每样本 16 位 + */ + private static final int BITS_PER_SAMPLE = 16; + /** * 缓存客户端流式解析的语音文本数据 */ private final Map cacheClientTts = new ConcurrentHashMap<>(); + /** + * 缓存客户端调用OpenAi中的websocket-STT 流式传输数据 + */ + private final Map cacheWebSocket = new ConcurrentHashMap<>(); + // 语音文件保存目录 private static final String VOICE_STORAGE_DIR = "/voice_files/"; @@ -61,6 +100,8 @@ public class ChatWebSocketHandler { public void onOpen(Session session, @PathParam("clientId") String clientId) { log.info("WebSocket 链接已建立:{}", clientId); cacheClientTts.put(clientId,new String()); + //初始化STT流式语音转换文本的socket链接 + createWhisperRealtimeSocket(clientId); } // 接收文本消息 @@ -148,20 +189,18 @@ public class ChatWebSocketHandler { log.info("3、开始接收数据流时间:{}",System.currentTimeMillis()/1000); try{ //接收到数据流后直接就进行SST处理 - //拿到文件进行文字转换 - saveAsWebM(bytes,pathUrl); - WhisperClient whisperClient = SpringUtils.getBean(WhisperClient.class); - String 
resultText = whisperClient.handleVoiceToText(pathUrl); - log.info("STT:{}",resultText); - //进行客户端文本数据存储 - String cacheString = cacheClientTts.get(clientId); - if(StrUtil.isNotEmpty(cacheString)){ - cacheString = cacheString+resultText; - }else { - cacheString = resultText; + //发送消息 + WebSocket webSocket = cacheWebSocket.get(clientId); + if(webSocket != null){ + log.info("3.1 开始发送数据音频流啦"); + // 将音频数据转换为 Base64 编码的字符串 + String base64Audio = Base64.getEncoder().encodeToString(bytes); + String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }"; + webSocket.send(message); + // 3. 提交音频并请求转录 + webSocket.send("{\"type\": \"input_audio_buffer.commit\"}"); + webSocket.send("{\"type\": \"response.create\"}"); } - cacheClientTts.put(clientId,cacheString); - }catch (Exception e){ e.printStackTrace(); } @@ -243,5 +282,87 @@ public class ChatWebSocketHandler { return null; } + /** + * 创建STT WebSocket 客户端链接 + * @param clientId 客户端ID + */ + private void createWhisperRealtimeSocket(String clientId){ + try{ + OkHttpClient client = new OkHttpClient(); + CountDownLatch latch = new CountDownLatch(1); + // 设置 WebSocket 请求 + Request request = new Request.Builder() + .url(API_URL) + .addHeader("Authorization", "Bearer " + apiKey) + .addHeader("OpenAI-Beta", "realtime=v1") + .build(); + client.newWebSocket(request, new WebSocketListener() { + @Override + public void onOpen(WebSocket webSocket, Response response) { + System.out.println("✅ WebSocket 连接成功"); + //发送配置 + JSONObject config = new JSONObject(); + JSONObject sessionConfig = new JSONObject(); + JSONObject transcription = new JSONObject(); + JSONObject turnDetection = new JSONObject(); + // 配置转录参数 + transcription.put("model", "gpt-4o-mini-transcribe"); + transcription.put("language", language); // 中文 + // 配置断句检测 + turnDetection.put("type", "server_vad"); + turnDetection.put("prefix_padding_ms", 300); + turnDetection.put("silence_duration_ms", 10); + // 组装完整配置 + 
sessionConfig.put("input_audio_transcription", transcription); + sessionConfig.put("turn_detection", turnDetection); + config.put("type", "transcription_session.update"); + config.put("session", sessionConfig); + webSocket.send(config.toString()); + // 1. 启动音频缓冲 + webSocket.send("{\"type\": \"input_audio_buffer.start\"}"); + //存储客户端webSocket对象,对数据进行隔离处理 + cacheWebSocket.put(clientId,webSocket); + } + + @Override + public void onMessage(WebSocket webSocket, String text) { + System.out.println("📩 收到转录结果: " + text); + //对数据进行解析 + if(StrUtil.isNotEmpty(text)){ + Map mapResultData = JSONUtil.toBean(text,Map.class); + if("conversation.item.input_audio_transcription.delta".equals(mapResultData.get("type"))){ + String resultText = mapResultData.get("delta"); + //进行客户端文本数据存储 + String cacheString = cacheClientTts.get(clientId); + if(StrUtil.isNotEmpty(cacheString)){ + cacheString = cacheString+resultText; + }else { + cacheString = resultText; + } + cacheClientTts.put(clientId,cacheString); + } + } + } + + @Override + public void onFailure(WebSocket webSocket, Throwable t, Response response) { + System.err.println("❌ 连接失败: " + t.getMessage()); + latch.countDown(); + } + + @Override + public void onClosing(WebSocket webSocket, int code, String reason) { + System.out.println("⚠️ 连接即将关闭: " + reason); + webSocket.close(1000, null); + latch.countDown(); + } + }); + // 等待 WebSocket 关闭 + latch.await(); + }catch (Exception e){ + e.printStackTrace(); + } + } + } diff --git a/vetti-common/pom.xml b/vetti-common/pom.xml index d9c1369..9da9e72 100644 --- a/vetti-common/pom.xml +++ b/vetti-common/pom.xml @@ -165,6 +165,15 @@ httpmime + + org.java-websocket + Java-WebSocket + + + org.apache.tomcat.embed + tomcat-embed-websocket + + diff --git a/vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java b/vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java new file mode 100644 index 0000000..4732cf3 --- /dev/null +++ 
b/vetti-common/src/main/java/com/vetti/common/ai/whisper/OpenAIRealtimeClient.java @@ -0,0 +1,165 @@ +package com.vetti.common.ai.whisper; + +import cn.hutool.json.JSONObject; +import lombok.extern.slf4j.Slf4j; +import org.java_websocket.client.WebSocketClient; +import org.java_websocket.handshake.ServerHandshake; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.ByteBuffer; +import java.util.Base64; +import java.util.HashMap; +import java.util.Map; + +@Slf4j +public class OpenAIRealtimeClient extends WebSocketClient { + + // 构造方法:初始化连接地址和请求头(携带认证信息) + public OpenAIRealtimeClient(String apiKey) throws URISyntaxException { + super( + new URI("wss://api.openai.com/v1/realtime?intent=transcription"), + buildHeaders(apiKey) // 构建请求头 + ); + } + + // 构建请求头(携带认证和内容类型) + private static Map buildHeaders(String apiKey) { + Map headers = new HashMap<>(); + headers.put("Authorization", "Bearer " + apiKey); // 核心认证头 + headers.put("Content-Type", "application/json"); // 根据接口要求调整 + headers.put("OpenAI-Beta", "realtime=v1"); // 若接口要求 beta 版本标识 + return headers; + } + + // 连接成功回调 + @Override + public void onOpen(ServerHandshake handshakedata) { + System.out.println("WebSocket 连接已打开,状态码:" + handshakedata.getHttpStatus()); + // 连接成功后可发送初始化消息(如配置转录参数) + sendInitMessage(); + } + + // 接收服务器消息回调(处理转录结果) + @Override + public void onMessage(String message) { + System.out.println("收到转录文本:" + message); + // 解析 JSON 格式的转录结果(可使用 Jackson/Gson 等库) + } + + // 接收二进制消息(若服务器返回二进制数据,如音频片段确认) + @Override + public void onMessage(ByteBuffer bytes) { + System.out.println("收到二进制数据,长度:" + bytes.remaining()); + // 处理二进制消息(如需) + } + + // 连接关闭回调 + @Override + public void onClose(int code, String reason, boolean remote) { + System.out.println("连接关闭,状态码:" + code + ",原因:" + reason); + } + + // 连接错误回调 + @Override + public void onError(Exception ex) { + System.err.println("连接错误:" + ex.getMessage()); + ex.printStackTrace(); + } + + // 发送初始化消息(根据 OpenAI 接口要求配置转录参数) + private void 
sendInitMessage() { + JSONObject config = new JSONObject(); + JSONObject sessionConfig = new JSONObject(); + JSONObject transcription = new JSONObject(); + JSONObject turnDetection = new JSONObject(); + + // 配置转录参数 + transcription.put("model", "gpt-4o-mini-transcribe"); + transcription.put("language", "zh"); // 中文 + + // 配置断句检测 + turnDetection.put("type", "server_vad"); + turnDetection.put("prefix_padding_ms", 300); + turnDetection.put("silence_duration_ms", 10); + + // 组装完整配置 + sessionConfig.put("input_audio_transcription", transcription); + sessionConfig.put("turn_detection", turnDetection); + config.put("type", "transcription_session.update"); + config.put("session", sessionConfig); + + this.send(config.toString()); + System.out.println("已发送初始化配置"); + } + + // 发送音频数据(核心:将麦克风/文件的音频流发送到服务器) + public void sendAudioData(byte[] audioBytes) { + if (this.isOpen()) { + // 按接口要求封装音频数据(通常为 JSON 包裹二进制,或直接发送二进制) + // OpenAI要求语音数据以Base64编码发送 + String base64Chunk = Base64.getEncoder().encodeToString(audioBytes); + String audioJson = "{\n" + + " \"type\": \"input_audio_buffer.append\",\n" + + " \"audio\": \""+base64Chunk+"\"\n" + + "}"; + this.send(audioJson); +// this.send(audioBytes); + System.out.println("已发送音频数据,长度:" + audioBytes.length); + } else { + System.err.println("连接未打开,无法发送音频"); + } + + } + + public void commitData() { + String base64Chunk = Base64.getEncoder().encodeToString(new byte[0]); + String audioJson = "{\n" + + " \"type\": \"input_audio_buffer.append\",\n" + + " \"audio\": \""+base64Chunk+"\"\n" + + "}"; + this.send(audioJson); + } + + public static void main(String[] args) { + String apiKey = System.getenv("OPENAI_API_KEY"); // SECURITY: leaked sk-proj-... key redacted — revoke it and never commit secrets; 替换为你的 OpenAI API Key + + try { + // 创建客户端 + OpenAIRealtimeClient client = new OpenAIRealtimeClient(apiKey); + // 连接服务器 + client.connectBlocking(); // 阻塞式连接(也可使用非阻塞 connect()) + + // 
模拟发送音频数据(实际应从麦克风或文件读取) + // 注意:音频格式需符合接口要求(通常为 PCM 16kHz 单声道等) + // 读取本地PCM文件(16kHz单声道16位)并分片发送 + try (java.io.FileInputStream fis = new java.io.FileInputStream("/Users/wangxiangshun/Desktop/临时文件/output1112.mp3")) { + byte[] buffer = new byte[6400]; // 200ms的PCM数据(16000Hz*16位*1声道=32000字节/秒 → 6400字节/200ms) + int len; + while ((len = fis.read(buffer)) != -1) { + byte[] chunk = new byte[len]; + System.arraycopy(buffer, 0, chunk, 0, len); + client.sendAudioData(chunk); + Thread.sleep(200); // 模拟实时流(每200ms发送一次) + } + } + + //发送一个空的二进制流 + // 4. 发送结束标记(空二进制消息) + client.sendAudioData(new byte[600]); + + // 等待转录完成(实际场景需根据业务逻辑控制) + Thread.sleep(20000); + + // 关闭连接 + client.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + // 模拟读取音频数据(实际需用音频库采集,如 Java Sound API 或 VLCJ) + private static byte[] readAudioFromSource() { + // 示例:返回空字节数组(实际应填充真实音频数据) + return new byte[1024]; + } +} diff --git a/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java b/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java new file mode 100644 index 0000000..619b96d --- /dev/null +++ b/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java @@ -0,0 +1,115 @@ +package com.vetti.common.ai.whisper; + +import cn.hutool.json.JSONObject; +import okhttp3.*; + +import javax.sound.sampled.AudioFormat; +import javax.sound.sampled.AudioSystem; +import javax.sound.sampled.DataLine; +import javax.sound.sampled.TargetDataLine; +import java.util.Base64; +import java.util.concurrent.CountDownLatch; + +public class RealtimeTranscriptionMicrophone { + + private static final String API_KEY = System.getenv("OPENAI_API_KEY"); // SECURITY: leaked sk-proj-... key redacted — revoke it and never commit secrets + private static final String URL = "wss://api.openai.com/v1/realtime?intent=transcription"; + private static final int SAMPLE_RATE = 
16000; // 16kHz + private static final int BUFFER_SIZE = 4096; // 4 KB 每次读取 + private static final int BITS_PER_SAMPLE = 16; // 每样本 16 位 + + public static void main(String[] args) throws Exception { + + OkHttpClient client = new OkHttpClient(); + CountDownLatch latch = new CountDownLatch(1); + + // 设置 WebSocket 请求 + Request request = new Request.Builder() + .url(URL) + .addHeader("Authorization", "Bearer " + API_KEY) + .addHeader("OpenAI-Beta", "realtime=v1") + .build(); + + WebSocket ws = client.newWebSocket(request, new WebSocketListener() { + @Override + public void onOpen(WebSocket webSocket, Response response) { + System.out.println("✅ WebSocket 连接成功"); + + //发送配置 + JSONObject config = new JSONObject(); + JSONObject sessionConfig = new JSONObject(); + JSONObject transcription = new JSONObject(); + JSONObject turnDetection = new JSONObject(); + + // 配置转录参数 + transcription.put("model", "gpt-4o-mini-transcribe"); + transcription.put("language", "zh"); // 中文 + // 配置断句检测 + turnDetection.put("type", "server_vad"); + turnDetection.put("prefix_padding_ms", 300); + turnDetection.put("silence_duration_ms", 10); + // 组装完整配置 + sessionConfig.put("input_audio_transcription", transcription); + sessionConfig.put("turn_detection", turnDetection); + config.put("type", "transcription_session.update"); + config.put("session", sessionConfig); + + webSocket.send(config.toString()); + + // 1. 启动音频缓冲 + webSocket.send("{\"type\": \"input_audio_buffer.start\"}"); + + // 2. 
开始录音并实时发送 + new Thread(() -> { + try { + // 设置麦克风输入流 + AudioFormat format = new AudioFormat(SAMPLE_RATE, BITS_PER_SAMPLE, 1, true, false); + DataLine.Info info = new DataLine.Info(TargetDataLine.class, format); + TargetDataLine line = (TargetDataLine) AudioSystem.getLine(info); + line.open(format); + line.start(); + + byte[] buffer = new byte[BUFFER_SIZE]; + int bytesRead; + while ((bytesRead = line.read(buffer, 0, buffer.length)) > 0) { + // 将音频数据转换为 Base64 编码的字符串 + byte[] audioData = new byte[bytesRead]; + System.arraycopy(buffer, 0, audioData, 0, bytesRead); + String base64Audio = Base64.getEncoder().encodeToString(audioData); + String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }"; + webSocket.send(message); + } + + // 3. 提交音频并请求转录 + webSocket.send("{\"type\": \"input_audio_buffer.commit\"}"); + webSocket.send("{\"type\": \"response.create\"}"); + } catch (Exception e) { + e.printStackTrace(); + } + }).start(); + } + + @Override + public void onMessage(WebSocket webSocket, String text) { + System.out.println("📩 收到转录结果: " + text); + } + + @Override + public void onFailure(WebSocket webSocket, Throwable t, Response response) { + System.err.println("❌ 连接失败: " + t.getMessage()); + latch.countDown(); + } + + @Override + public void onClosing(WebSocket webSocket, int code, String reason) { + System.out.println("⚠️ 连接即将关闭: " + reason); + webSocket.close(1000, null); + latch.countDown(); + } + }); + + // 等待 WebSocket 关闭 + latch.await(); + } +} +