STT 数据流处理

2025-10-20 11:12:29 +08:00
parent 0d0c6c32f0
commit 36cceafac5
3 changed files with 183 additions and 96 deletions
--- a/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java
+++ b/vetti-admin/src/main/java/com/vetti/socket/ChatWebSocketHandler.java
@@ -14,6 +14,7 @@ import okhttp3.*;
 import org.apache.commons.io.FileUtils;
 import org.springframework.stereotype.Component;
 import javax.sound.sampled.AudioFormat;
 import javax.sound.sampled.AudioInputStream;
 import javax.sound.sampled.AudioSystem;
 import javax.websocket.*;
@@ -21,10 +22,7 @@ import javax.websocket.server.PathParam;
 import javax.websocket.server.ServerEndpoint;
 import java.io.*;
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
+import java.util.*;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 /**
@@ -200,51 +198,6 @@ public class ChatWebSocketHandler {
        }
    }
 //    // 接收二进制消息（流数据）
 //    @OnMessage
 //    public void onBinaryMessage(Session session, @PathParam("clientId") String clientId, ByteBuffer byteBuffer) {
 //        log.info("1、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
 //        log.info("客户端ID为:{}", clientId);
 //        // 处理二进制流数据
 //        byte[] bytes = new byte[byteBuffer.remaining()];
 //        //从缓冲区中读取数据并存储到指定的字节数组中
 //        byteBuffer.get(bytes);
 //        log.info("2、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
 //        // 生成唯一文件名
 //        String fileName = clientId + "_" + System.currentTimeMillis() + ".wav";
 //        String pathUrl = RuoYiConfig.getProfile()+VOICE_STORAGE_DIR + fileName;
 //        log.info("文件路径为:{}", pathUrl);
 //        log.info("3、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
 //        try{
 //            log.info("文件流的大小为:{}",bytes.length);
 //            saveAsWebM(bytes,pathUrl);
 //            //接收到数据流后直接就进行SST处理
 //            //发送消息
 //            WebSocket webSocket = cacheWebSocket.get(clientId);
 //            log.info("获取的socket对象为:{}",webSocket);
 //            if(webSocket != null){
 ////                 1. 启动音频缓冲
 ////                webSocket.send("{\"type\": \"input_audio_buffer.start\"}");
 //                log.info("3.1 开始发送数据音频流啦");
 //                // 将音频数据转换为 Base64 编码的字符串
 //                //进行转换
 //                // 转换音频格式
 //                AudioFormat format = new AudioFormat(SAMPLE_RATE, BITS_PER_SAMPLE, 1, true, false);
 //                byte[] outputAudioBytes = convertAudio(bytes, format);
 //                String base64Audio = Base64.getEncoder().encodeToString(outputAudioBytes);
 //                String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }";
 //                webSocket.send(message);
 //                log.info("4、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
 //                // 3. 提交音频并请求转录
 ////                webSocket.send("{\"type\": \"input_audio_buffer.commit\"}");
 ////                webSocket.send("{\"type\": \"response.create\"}");
 //            }
 //        }catch (Exception e){
 //            e.printStackTrace();
 //        }
 //
 //    }
    // 接收二进制消息（流数据）
    @OnMessage
    public void onBinaryMessage(Session session, @PathParam("clientId") String clientId, ByteBuffer byteBuffer) {
@@ -254,17 +207,66 @@ public class ChatWebSocketHandler {
        byte[] bytes = new byte[byteBuffer.remaining()];
        //从缓冲区中读取数据并存储到指定的字节数组中
        byteBuffer.get(bytes);
        log.info("2、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
        // 生成唯一文件名
        String fileName = clientId + "_" + System.currentTimeMillis() + ".wav";
        String pathUrl = RuoYiConfig.getProfile()+VOICE_STORAGE_DIR + fileName;
        log.info("文件路径为:{}", pathUrl);
        log.info("3、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
        try{
            log.info("文件流的大小为:{}",bytes.length);
            saveAsWebM(bytes,pathUrl);
            //接收到数据流后直接就进行STT处理
            //语音格式转换
            String fileOutName = clientId + "_" + System.currentTimeMillis() + ".pcm";
            String pathOutUrl = RuoYiConfig.getProfile()+VOICE_STORAGE_DIR + fileOutName;
            handleAudioToPCM(pathUrl,pathOutUrl);
            //发送消息
            WebSocket webSocket = cacheWebSocket.get(clientId);
            log.info("获取的socket对象为:{}",webSocket);
            if(webSocket != null){
 //                 1. 启动音频缓冲
 //                webSocket.send("{\"type\": \"input_audio_buffer.start\"}");
                log.info("3.1 开始发送数据音频流啦");
                File outputFile = new File(pathOutUrl);  // 输出PCM格式文件
                ByteBuffer buffer = ByteBuffer.wrap(FileUtils.readFileToByteArray(outputFile));
                byte[] outBytes = new byte[buffer.remaining()];
                //从缓冲区中读取数据并存储到指定的字节数组中
                buffer.get(outBytes);
                String base64Audio = Base64.getEncoder().encodeToString(outBytes);
                String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }";
                webSocket.send(message);
                log.info("4、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
                // 3. 提交音频并请求转录
 //                webSocket.send("{\"type\": \"input_audio_buffer.commit\"}");
 //                webSocket.send("{\"type\": \"response.create\"}");
            }
        }catch (Exception e){
            e.printStackTrace();
        }
        // 1. 获取当前会话的缓存
        List<byte[]> fragments = fragmentCache.get(clientId);
        if (fragments == null) {
            fragments = new ArrayList<>();
            fragmentCache.put(clientId, fragments);
        }
        fragments.add(bytes);
        fragmentCache.put(clientId, fragments);
    }
 //    // 接收二进制消息（流数据）
 //    @OnMessage
 //    public void onBinaryMessage(Session session, @PathParam("clientId") String clientId, ByteBuffer byteBuffer) {
 //        log.info("1、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
 //        log.info("客户端ID为:{}", clientId);
 //        // 处理二进制流数据
 //        byte[] bytes = new byte[byteBuffer.remaining()];
 //        //从缓冲区中读取数据并存储到指定的字节数组中
 //        byteBuffer.get(bytes);
 //
 //        // 1. 获取当前会话的缓存
 //        List<byte[]> fragments = fragmentCache.get(clientId);
 //        if (fragments == null) {
 //            fragments = new ArrayList<>();
 //            fragmentCache.put(clientId, fragments);
 //        }
 //        fragments.add(bytes);
 //        fragmentCache.put(clientId, fragments);
 //    }
    // 连接关闭时调用
    @OnClose
    public void onClose(Session session, CloseReason reason) {
@@ -278,28 +280,6 @@ public class ChatWebSocketHandler {
        throwable.printStackTrace();
    }
 //    public static byte[] convertAudio(byte[] inputAudioBytes, AudioFormat targetFormat) throws Exception {
 //        // 将 byte[] 转换为 AudioInputStream
 //        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(inputAudioBytes);
 //        AudioInputStream inputAudioStream = new AudioInputStream(byteArrayInputStream, targetFormat, inputAudioBytes.length);
 //
 //        // 创建目标格式的 AudioInputStream
 //        AudioInputStream outputAudioStream = AudioSystem.getAudioInputStream(targetFormat, inputAudioStream);
 //
 //        // 获取输出音频的 byte[]
 //        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
 //        byte[] buffer = new byte[1024];
 //        int bytesRead;
 //
 //        // 从 AudioInputStream 读取数据并写入 ByteArrayOutputStream
 //        while ((bytesRead = outputAudioStream.read(buffer)) != -1) {
 //            byteArrayOutputStream.write(buffer, 0, bytesRead);
 //        }
 //
 //        // 返回转换后的 byte[]
 //        return byteArrayOutputStream.toByteArray();
 //    }
    /**
     * 将字节数组保存为WebM文件
     *
@@ -466,6 +446,49 @@ public class ChatWebSocketHandler {
        return result;
    }
    /**
     * Converts an audio file to headerless 16-bit, mono, signed, little-endian PCM.
     *
     * <p>The sample rate of the source file is kept as-is. Conversion failures are
     * reported on the console and are NOT propagated to the caller; in that case the
     * output file may be absent or incomplete.
     *
     * @param pathUrl    path of the input audio file (must be readable by {@link AudioSystem})
     * @param outPathUrl path of the raw PCM file to write
     */
    private void handleAudioToPCM(String pathUrl,String outPathUrl){
        File inputFile = new File(pathUrl);
        File outputFile = new File(outPathUrl);
        // try-with-resources guarantees the source stream is released even when the
        // format lookup or the conversion below throws.
        try (AudioInputStream inputAudioStream = AudioSystem.getAudioInputStream(inputFile)) {
            AudioFormat sourceFormat = inputAudioStream.getFormat();
            System.out.println("Input Audio Format: " + sourceFormat);
            // Target format: linear PCM at the source sample rate.
            AudioFormat pcmFormat = new AudioFormat(
                    AudioFormat.Encoding.PCM_SIGNED,
                    sourceFormat.getSampleRate(),
                    16,    // bits per sample
                    1,     // mono
                    2,     // frame size in bytes: one 16-bit sample per frame
                    sourceFormat.getSampleRate(), // frame rate equals sample rate for PCM
                    false  // bigEndian = false, i.e. little-endian byte order
            );
            // Both the converted stream and the output file are closed on every exit path.
            try (AudioInputStream pcmAudioStream = AudioSystem.getAudioInputStream(pcmFormat, inputAudioStream);
                 FileOutputStream fos = new FileOutputStream(outputFile)) {
                byte[] buffer = new byte[1024];
                int bytesRead;
                // Copy the decoded PCM bytes to the output file.
                while ((bytesRead = pcmAudioStream.read(buffer)) != -1) {
                    fos.write(buffer, 0, bytesRead);
                }
            }
            System.out.println("Audio has been converted to PCM format and saved at: " + outputFile.getAbsolutePath());
        } catch (Exception e) {
            // Best-effort conversion: the caller carries on, so report and return.
            e.printStackTrace();
        }
    }
 }
--- a/vetti-common/src/main/java/com/vetti/common/ai/whisper/AudioToPCM.java
+++ b/vetti-common/src/main/java/com/vetti/common/ai/whisper/AudioToPCM.java
@@ -0,0 +1,54 @@
 package com.vetti.common.ai.whisper;
 import javax.sound.sampled.*;
 import java.io.*;
 /**
  * Small command-line utility that converts an audio file to headerless 16-bit,
  * mono, signed, little-endian PCM, keeping the sample rate of the source file.
  *
  * <p>Usage: {@code AudioToPCM [inputFile [outputFile]]}. Arguments that are
  * omitted fall back to the built-in default paths.
  */
 public class AudioToPCM {

    /** Input file used when no first argument is supplied. */
    private static final String DEFAULT_INPUT_PATH = "/Users/wangxiangshun/Desktop/临时文件/110/buffer.wav";

    /** Output file used when no second argument is supplied. */
    private static final String DEFAULT_OUTPUT_PATH = "/Users/wangxiangshun/Desktop/临时文件/110/output_pcm_audio.pcm";

    /**
     * Entry point. Failures are printed to the console rather than propagated.
     *
     * @param args optional {@code [inputFile, outputFile]} paths
     */
    public static void main(String[] args) {
        File inputFile = new File(args != null && args.length > 0 ? args[0] : DEFAULT_INPUT_PATH);
        File outputFile = new File(args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH);
        try {
            convert(inputFile, outputFile);
            System.out.println("Audio has been converted to PCM format and saved at: " + outputFile.getAbsolutePath());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Decodes {@code inputFile} and writes its samples as raw PCM to {@code outputFile}.
     *
     * @param inputFile  audio file readable by {@link AudioSystem}
     * @param outputFile destination for the raw PCM bytes
     * @throws IOException                   if reading or writing fails
     * @throws UnsupportedAudioFileException if the input is not a recognised audio file
     */
    private static void convert(File inputFile, File outputFile)
            throws IOException, UnsupportedAudioFileException {
        // try-with-resources releases the source stream on every exit path.
        try (AudioInputStream inputAudioStream = AudioSystem.getAudioInputStream(inputFile)) {
            AudioFormat sourceFormat = inputAudioStream.getFormat();
            System.out.println("Input Audio Format: " + sourceFormat);
            // Target format: linear PCM at the source sample rate.
            AudioFormat pcmFormat = new AudioFormat(
                    AudioFormat.Encoding.PCM_SIGNED,
                    sourceFormat.getSampleRate(),
                    16,    // bits per sample
                    1,     // mono
                    2,     // frame size in bytes: one 16-bit sample per frame
                    sourceFormat.getSampleRate(), // frame rate equals sample rate for PCM
                    false  // bigEndian = false, i.e. little-endian byte order
            );
            try (AudioInputStream pcmAudioStream = AudioSystem.getAudioInputStream(pcmFormat, inputAudioStream);
                 OutputStream fos = new FileOutputStream(outputFile)) {
                byte[] buffer = new byte[1024];
                int bytesRead;
                // Copy the decoded PCM bytes to the output file.
                while ((bytesRead = pcmAudioStream.read(buffer)) != -1) {
                    fos.write(buffer, 0, bytesRead);
                }
            }
        }
    }
 }
--- a/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java
+++ b/vetti-common/src/main/java/com/vetti/common/ai/whisper/RealtimeTranscriptionMicrophone.java
@@ -2,11 +2,14 @@ package com.vetti.common.ai.whisper;
 import cn.hutool.json.JSONObject;
 import okhttp3.*;
 import org.apache.commons.io.FileUtils;
 import javax.sound.sampled.AudioFormat;
 import javax.sound.sampled.AudioSystem;
 import javax.sound.sampled.DataLine;
 import javax.sound.sampled.TargetDataLine;
 import java.io.File;
 import java.nio.ByteBuffer;
 import java.util.Base64;
 import java.util.concurrent.CountDownLatch;
@@ -63,23 +66,30 @@ public class RealtimeTranscriptionMicrophone {
                new Thread(() -> {
                    try {
                        // 设置麦克风输入流
-                        AudioFormat format = new AudioFormat(SAMPLE_RATE, BITS_PER_SAMPLE, 1, true, false);
+//                        AudioFormat format = new AudioFormat(SAMPLE_RATE, BITS_PER_SAMPLE, 1, true, false);
-                        DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
+//                        DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
-                        TargetDataLine line = (TargetDataLine) AudioSystem.getLine(info);
+//                        TargetDataLine line = (TargetDataLine) AudioSystem.getLine(info);
-                        line.open(format);
+//                        line.open(format);
-                        line.start();
+//                        line.start();
-
+//
-                        byte[] buffer = new byte[BUFFER_SIZE];
+//                        byte[] buffer = new byte[BUFFER_SIZE];
-                        int bytesRead;
+//                        int bytesRead;
-                        while ((bytesRead = line.read(buffer, 0, buffer.length)) > 0) {
+//                        while ((bytesRead = line.read(buffer, 0, buffer.length)) > 0) {
-                            // 将音频数据转换为 Base64 编码的字符串
+//                            // 将音频数据转换为 Base64 编码的字符串
-                            byte[] audioData = new byte[bytesRead];
+//                            byte[] audioData = new byte[bytesRead];
-                            System.arraycopy(buffer, 0, audioData, 0, bytesRead);
+//                            System.arraycopy(buffer, 0, audioData, 0, bytesRead);
-                            String base64Audio = Base64.getEncoder().encodeToString(audioData);
+//                            String base64Audio = Base64.getEncoder().encodeToString(audioData);
 //                            String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }";
 //                            webSocket.send(message);
 //                        }
                        File outputFile = new File("/Users/wangxiangshun/Desktop/临时文件/110/output_pcm_audio.pcm");  // 输出PCM格式文件
                        ByteBuffer byteBuffer = ByteBuffer.wrap(FileUtils.readFileToByteArray(outputFile));
                        byte[] bytes = new byte[byteBuffer.remaining()];
                        //从缓冲区中读取数据并存储到指定的字节数组中
                        byteBuffer.get(bytes);
                        String base64Audio = Base64.getEncoder().encodeToString(bytes);
                        String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }";
                        webSocket.send(message);
                        }
                        // 3. 提交音频并请求转录
 //                        webSocket.send("{\"type\": \"input_audio_buffer.commit\"}");
 //                        webSocket.send("{\"type\": \"response.create\"}");