STT 数据流处理

This commit is contained in:
2025-10-20 11:12:29 +08:00
parent 0d0c6c32f0
commit 36cceafac5
3 changed files with 183 additions and 96 deletions

View File

@@ -14,6 +14,7 @@ import okhttp3.*;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream; import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem; import javax.sound.sampled.AudioSystem;
import javax.websocket.*; import javax.websocket.*;
@@ -21,10 +22,7 @@ import javax.websocket.server.PathParam;
import javax.websocket.server.ServerEndpoint; import javax.websocket.server.ServerEndpoint;
import java.io.*; import java.io.*;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.util.ArrayList; import java.util.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
/** /**
@@ -200,51 +198,6 @@ public class ChatWebSocketHandler {
} }
} }
// // 接收二进制消息(流数据)
// @OnMessage
// public void onBinaryMessage(Session session, @PathParam("clientId") String clientId, ByteBuffer byteBuffer) {
// log.info("1、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
// log.info("客户端ID为:{}", clientId);
// // 处理二进制流数据
// byte[] bytes = new byte[byteBuffer.remaining()];
// //从缓冲区中读取数据并存储到指定的字节数组中
// byteBuffer.get(bytes);
// log.info("2、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
// // 生成唯一文件名
// String fileName = clientId + "_" + System.currentTimeMillis() + ".wav";
// String pathUrl = RuoYiConfig.getProfile()+VOICE_STORAGE_DIR + fileName;
// log.info("文件路径为:{}", pathUrl);
// log.info("3、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
// try{
// log.info("文件流的大小为:{}",bytes.length);
// saveAsWebM(bytes,pathUrl);
// //接收到数据流后直接就进行SST处理
// //发送消息
// WebSocket webSocket = cacheWebSocket.get(clientId);
// log.info("获取的socket对象为:{}",webSocket);
// if(webSocket != null){
//// 1. 启动音频缓冲
//// webSocket.send("{\"type\": \"input_audio_buffer.start\"}");
// log.info("3.1 开始发送数据音频流啦");
// // 将音频数据转换为 Base64 编码的字符串
// //进行转换
// // 转换音频格式
// AudioFormat format = new AudioFormat(SAMPLE_RATE, BITS_PER_SAMPLE, 1, true, false);
// byte[] outputAudioBytes = convertAudio(bytes, format);
// String base64Audio = Base64.getEncoder().encodeToString(outputAudioBytes);
// String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }";
// webSocket.send(message);
// log.info("4、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
// // 3. 提交音频并请求转录
//// webSocket.send("{\"type\": \"input_audio_buffer.commit\"}");
//// webSocket.send("{\"type\": \"response.create\"}");
// }
// }catch (Exception e){
// e.printStackTrace();
// }
//
// }
// 接收二进制消息(流数据) // 接收二进制消息(流数据)
@OnMessage @OnMessage
public void onBinaryMessage(Session session, @PathParam("clientId") String clientId, ByteBuffer byteBuffer) { public void onBinaryMessage(Session session, @PathParam("clientId") String clientId, ByteBuffer byteBuffer) {
@@ -254,17 +207,66 @@ public class ChatWebSocketHandler {
byte[] bytes = new byte[byteBuffer.remaining()]; byte[] bytes = new byte[byteBuffer.remaining()];
//从缓冲区中读取数据并存储到指定的字节数组中 //从缓冲区中读取数据并存储到指定的字节数组中
byteBuffer.get(bytes); byteBuffer.get(bytes);
log.info("2、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
// 生成唯一文件名
String fileName = clientId + "_" + System.currentTimeMillis() + ".wav";
String pathUrl = RuoYiConfig.getProfile()+VOICE_STORAGE_DIR + fileName;
log.info("文件路径为:{}", pathUrl);
log.info("3、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
try{
log.info("文件流的大小为:{}",bytes.length);
saveAsWebM(bytes,pathUrl);
//接收到数据流后直接就进行SST处理
//语音格式转换
String fileOutName = clientId + "_" + System.currentTimeMillis() + ".pcm";
String pathOutUrl = RuoYiConfig.getProfile()+VOICE_STORAGE_DIR + fileOutName;
handleAudioToPCM(pathUrl,pathOutUrl);
//发送消息
WebSocket webSocket = cacheWebSocket.get(clientId);
log.info("获取的socket对象为:{}",webSocket);
if(webSocket != null){
// 1. 启动音频缓冲
// webSocket.send("{\"type\": \"input_audio_buffer.start\"}");
log.info("3.1 开始发送数据音频流啦");
File outputFile = new File(pathOutUrl); // 输出PCM格式文件
ByteBuffer buffer = ByteBuffer.wrap(FileUtils.readFileToByteArray(outputFile));
byte[] outBytes = new byte[buffer.remaining()];
//从缓冲区中读取数据并存储到指定的字节数组中
buffer.get(outBytes);
String base64Audio = Base64.getEncoder().encodeToString(outBytes);
String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }";
webSocket.send(message);
log.info("4、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
// 3. 提交音频并请求转录
// webSocket.send("{\"type\": \"input_audio_buffer.commit\"}");
// webSocket.send("{\"type\": \"response.create\"}");
}
}catch (Exception e){
e.printStackTrace();
}
// 1. 获取当前会话的缓存
List<byte[]> fragments = fragmentCache.get(clientId);
if (fragments == null) {
fragments = new ArrayList<>();
fragmentCache.put(clientId, fragments);
}
fragments.add(bytes);
fragmentCache.put(clientId, fragments);
} }
// // 接收二进制消息(流数据)
// @OnMessage
// public void onBinaryMessage(Session session, @PathParam("clientId") String clientId, ByteBuffer byteBuffer) {
// log.info("1、开始接收数据流时间:{}",System.currentTimeMillis()/1000);
// log.info("客户端ID为:{}", clientId);
// // 处理二进制流数据
// byte[] bytes = new byte[byteBuffer.remaining()];
// //从缓冲区中读取数据并存储到指定的字节数组中
// byteBuffer.get(bytes);
//
// // 1. 获取当前会话的缓存
// List<byte[]> fragments = fragmentCache.get(clientId);
// if (fragments == null) {
// fragments = new ArrayList<>();
// fragmentCache.put(clientId, fragments);
// }
// fragments.add(bytes);
// fragmentCache.put(clientId, fragments);
// }
// 连接关闭时调用 // 连接关闭时调用
@OnClose @OnClose
public void onClose(Session session, CloseReason reason) { public void onClose(Session session, CloseReason reason) {
@@ -278,28 +280,6 @@ public class ChatWebSocketHandler {
throwable.printStackTrace(); throwable.printStackTrace();
} }
// public static byte[] convertAudio(byte[] inputAudioBytes, AudioFormat targetFormat) throws Exception {
// // 将 byte[] 转换为 AudioInputStream
// ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(inputAudioBytes);
// AudioInputStream inputAudioStream = new AudioInputStream(byteArrayInputStream, targetFormat, inputAudioBytes.length);
//
// // 创建目标格式的 AudioInputStream
// AudioInputStream outputAudioStream = AudioSystem.getAudioInputStream(targetFormat, inputAudioStream);
//
// // 获取输出音频的 byte[]
// ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
// byte[] buffer = new byte[1024];
// int bytesRead;
//
// // 从 AudioInputStream 读取数据并写入 ByteArrayOutputStream
// while ((bytesRead = outputAudioStream.read(buffer)) != -1) {
// byteArrayOutputStream.write(buffer, 0, bytesRead);
// }
//
// // 返回转换后的 byte[]
// return byteArrayOutputStream.toByteArray();
// }
/** /**
* 将字节数组保存为WebM文件 * 将字节数组保存为WebM文件
* *
@@ -466,6 +446,49 @@ public class ChatWebSocketHandler {
return result; return result;
} }
/**
 * Converts the audio file at {@code pathUrl} to raw 16-bit, mono, signed,
 * little-endian PCM and writes the samples to {@code outPathUrl}.
 *
 * <p>The sample rate of the input file is kept. The conversion is best-effort:
 * any failure is printed and swallowed, so the output file may be missing or
 * incomplete when this method returns.
 *
 * @param pathUrl    path of the input audio file
 * @param outPathUrl path of the raw PCM file to write
 */
private void handleAudioToPCM(String pathUrl,String outPathUrl){
    File inputFile = new File(pathUrl);     // input audio file
    File outputFile = new File(outPathUrl); // output raw PCM file
    // try-with-resources closes every stream even when reading or conversion fails
    try (AudioInputStream inputAudioStream = AudioSystem.getAudioInputStream(inputFile)) {
        // Inspect the format of the input audio
        AudioFormat sourceFormat = inputAudioStream.getFormat();
        System.out.println("Input Audio Format: " + sourceFormat);
        // Target format: linear PCM, 16-bit, mono, same sample rate as the input
        AudioFormat pcmFormat = new AudioFormat(
                AudioFormat.Encoding.PCM_SIGNED,
                sourceFormat.getSampleRate(),
                16,                           // bits per sample
                1,                            // mono
                2,                            // frame size in bytes: 1 channel * 16 bits
                sourceFormat.getSampleRate(), // frame rate equals the sample rate for PCM
                false                         // little-endian byte order
        );
        // The output file is opened only after the conversion stream was obtained,
        // so an unsupported conversion does not leave an empty output file behind.
        try (AudioInputStream pcmAudioStream = AudioSystem.getAudioInputStream(pcmFormat, inputAudioStream);
             FileOutputStream fos = new FileOutputStream(outputFile)) {
            byte[] buffer = new byte[1024];
            int bytesRead;
            // Copy the converted PCM data to the output file
            while ((bytesRead = pcmAudioStream.read(buffer)) != -1) {
                fos.write(buffer, 0, bytesRead);
            }
        }
        System.out.println("Audio has been converted to PCM format and saved at: " + outputFile.getAbsolutePath());
    } catch (Exception e) {
        // NOTE(review): deliberately best-effort as in the original; consider
        // reporting this through the class logger instead of stderr.
        e.printStackTrace();
    }
}
} }

View File

@@ -0,0 +1,54 @@
package com.vetti.common.ai.whisper;
import javax.sound.sampled.*;
import java.io.*;
/**
 * Small command-line utility that converts an audio file readable by
 * {@link AudioSystem} into raw 16-bit, mono, signed, little-endian PCM.
 */
public class AudioToPCM {

    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH =
            "/Users/wangxiangshun/Desktop/临时文件/110/buffer.wav";

    /** Output file used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH =
            "/Users/wangxiangshun/Desktop/临时文件/110/output_pcm_audio.pcm";

    /**
     * Runs the conversion and prints any failure to standard error.
     *
     * @param args optional: {@code args[0]} is the input audio file and
     *             {@code args[1]} the output PCM file; a missing argument
     *             falls back to the built-in default path
     */
    public static void main(String[] args) {
        String inputPath = args != null && args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;
        try {
            convert(new File(inputPath), new File(outputPath));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts {@code inputFile} to raw PCM (16-bit, mono, signed, little-endian,
     * same sample rate as the input) and writes the samples to {@code outputFile}.
     *
     * @param inputFile  audio file to read
     * @param outputFile raw PCM file to write; overwritten if it exists
     * @throws IOException                   if reading or writing fails
     * @throws UnsupportedAudioFileException if the input is not a recognized audio file
     * @throws IllegalArgumentException      if the conversion to the target format is not supported
     */
    public static void convert(File inputFile, File outputFile)
            throws IOException, UnsupportedAudioFileException {
        // try-with-resources closes every stream even when reading or conversion fails
        try (AudioInputStream inputAudioStream = AudioSystem.getAudioInputStream(inputFile)) {
            // Inspect the format of the input audio
            AudioFormat sourceFormat = inputAudioStream.getFormat();
            System.out.println("Input Audio Format: " + sourceFormat);
            // Target format: linear PCM, 16-bit, mono, same sample rate as the input
            AudioFormat pcmFormat = new AudioFormat(
                    AudioFormat.Encoding.PCM_SIGNED,
                    sourceFormat.getSampleRate(),
                    16,                           // bits per sample
                    1,                            // mono
                    2,                            // frame size in bytes: 1 channel * 16 bits
                    sourceFormat.getSampleRate(), // frame rate equals the sample rate for PCM
                    false                         // little-endian byte order
            );
            // The output file is opened only after the conversion stream was obtained,
            // so an unsupported conversion does not leave an empty output file behind.
            try (AudioInputStream pcmAudioStream = AudioSystem.getAudioInputStream(pcmFormat, inputAudioStream);
                 FileOutputStream fos = new FileOutputStream(outputFile)) {
                byte[] buffer = new byte[1024];
                int bytesRead;
                // Copy the converted PCM data to the output file
                while ((bytesRead = pcmAudioStream.read(buffer)) != -1) {
                    fos.write(buffer, 0, bytesRead);
                }
            }
            System.out.println("Audio has been converted to PCM format and saved at: " + outputFile.getAbsolutePath());
        }
    }
}

View File

@@ -2,11 +2,14 @@ package com.vetti.common.ai.whisper;
import cn.hutool.json.JSONObject; import cn.hutool.json.JSONObject;
import okhttp3.*; import okhttp3.*;
import org.apache.commons.io.FileUtils;
import javax.sound.sampled.AudioFormat; import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioSystem; import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.DataLine; import javax.sound.sampled.DataLine;
import javax.sound.sampled.TargetDataLine; import javax.sound.sampled.TargetDataLine;
import java.io.File;
import java.nio.ByteBuffer;
import java.util.Base64; import java.util.Base64;
import java.util.concurrent.CountDownLatch; import java.util.concurrent.CountDownLatch;
@@ -63,23 +66,30 @@ public class RealtimeTranscriptionMicrophone {
new Thread(() -> { new Thread(() -> {
try { try {
// 设置麦克风输入流 // 设置麦克风输入流
AudioFormat format = new AudioFormat(SAMPLE_RATE, BITS_PER_SAMPLE, 1, true, false); // AudioFormat format = new AudioFormat(SAMPLE_RATE, BITS_PER_SAMPLE, 1, true, false);
DataLine.Info info = new DataLine.Info(TargetDataLine.class, format); // DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
TargetDataLine line = (TargetDataLine) AudioSystem.getLine(info); // TargetDataLine line = (TargetDataLine) AudioSystem.getLine(info);
line.open(format); // line.open(format);
line.start(); // line.start();
//
byte[] buffer = new byte[BUFFER_SIZE]; // byte[] buffer = new byte[BUFFER_SIZE];
int bytesRead; // int bytesRead;
while ((bytesRead = line.read(buffer, 0, buffer.length)) > 0) { // while ((bytesRead = line.read(buffer, 0, buffer.length)) > 0) {
// 将音频数据转换为 Base64 编码的字符串 // // 将音频数据转换为 Base64 编码的字符串
byte[] audioData = new byte[bytesRead]; // byte[] audioData = new byte[bytesRead];
System.arraycopy(buffer, 0, audioData, 0, bytesRead); // System.arraycopy(buffer, 0, audioData, 0, bytesRead);
String base64Audio = Base64.getEncoder().encodeToString(audioData); // String base64Audio = Base64.getEncoder().encodeToString(audioData);
// String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }";
// webSocket.send(message);
// }
File outputFile = new File("/Users/wangxiangshun/Desktop/临时文件/110/output_pcm_audio.pcm"); // 输出PCM格式文件
ByteBuffer byteBuffer = ByteBuffer.wrap(FileUtils.readFileToByteArray(outputFile));
byte[] bytes = new byte[byteBuffer.remaining()];
//从缓冲区中读取数据并存储到指定的字节数组中
byteBuffer.get(bytes);
String base64Audio = Base64.getEncoder().encodeToString(bytes);
String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }"; String message = "{ \"type\": \"input_audio_buffer.append\", \"audio\": \"" + base64Audio + "\" }";
webSocket.send(message); webSocket.send(message);
}
// 3. 提交音频并请求转录 // 3. 提交音频并请求转录
// webSocket.send("{\"type\": \"input_audio_buffer.commit\"}"); // webSocket.send("{\"type\": \"input_audio_buffer.commit\"}");
// webSocket.send("{\"type\": \"response.create\"}"); // webSocket.send("{\"type\": \"response.create\"}");