add:添加火山引擎语音合成

This commit is contained in:
Maxchen
2025-11-06 19:43:52 +08:00
parent f3e1aa6cdd
commit e402330692
15 changed files with 768 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
package org.ruoyi.aihuman.config;
/**
 * Empty placeholder: declares no fields, methods or annotations yet.
 *
 * <p>NOTE(review): the name suggests web-layer configuration for the aihuman module — confirm
 * the intended responsibility before adding members.
 */
public class WebConfig {
}

View File

@@ -0,0 +1,4 @@
package org.ruoyi.aihuman.controller;
/**
 * Empty placeholder: declares no endpoints, fields or annotations yet.
 *
 * <p>NOTE(review): presumably the future HTTP entry point for the Volcengine speech features —
 * confirm; as written it is a plain class and is not registered with any framework.
 */
public class AihumanVolcengineController {
}

View File

@@ -0,0 +1,4 @@
package org.ruoyi.aihuman.domain;
/**
 * Empty placeholder: declares no fields yet.
 *
 * <p>NOTE(review): presumably the request body for a voice synthesis call — confirm the
 * intended fields.
 */
public class VoiceRequest {
}

View File

@@ -0,0 +1,26 @@
package com.speech.protocol;
import lombok.Getter;
/**
 * Compression codes stored in the low four bits of the third header byte of a protocol
 * {@code Message}.
 */
@Getter
public enum CompressionBits {
    None_((byte) 0x0),
    Gzip((byte) 0x1),
    Custom((byte) 0x3);

    /** Numeric code written to / read from the wire. */
    private final byte value;

    CompressionBits(byte code) {
        value = code;
    }

    /**
     * Looks up the constant whose code equals {@code value}.
     *
     * @param value numeric code to resolve
     * @return the matching constant
     * @throws IllegalArgumentException if no constant carries that code
     */
    public static CompressionBits fromValue(int value) {
        CompressionBits[] candidates = values();
        for (int i = 0; i < candidates.length; i++) {
            CompressionBits candidate = candidates[i];
            if (candidate.value == value) {
                return candidate;
            }
        }
        throw new IllegalArgumentException("Unknown CompressionBits value: " + value);
    }
}

View File

@@ -0,0 +1,90 @@
package com.speech.protocol;
import lombok.Getter;
/**
 * Event numbers exchanged in protocol messages.
 *
 * <p>Several constants deliberately share one numeric value (for example
 * {@link #START_CONNECTION} and {@link #START_TASK}); {@link #fromValue(int)} always resolves
 * such a value to the constant declared first.
 */
@Getter
public enum EventType {
    // Default event
    NONE(0),

    // Upstream Connection events (1-49)
    START_CONNECTION(1),
    START_TASK(1),
    FINISH_CONNECTION(2),
    FINISH_TASK(2),

    // Downstream Connection events (50-99)
    CONNECTION_STARTED(50),
    TASK_STARTED(50),
    CONNECTION_FAILED(51),
    TASK_FAILED(51),
    CONNECTION_FINISHED(52),
    TASK_FINISHED(52),

    // Upstream Session events (100-149)
    START_SESSION(100),
    CANCEL_SESSION(101),
    FINISH_SESSION(102),

    // Downstream Session events (150-199)
    SESSION_STARTED(150),
    SESSION_CANCELED(151),
    SESSION_FINISHED(152),
    SESSION_FAILED(153),
    USAGE_RESPONSE(154),
    CHARGE_DATA(154),

    // Upstream General events (200-249)
    TASK_REQUEST(200),
    UPDATE_CONFIG(201),

    // Downstream General events (250-299)
    AUDIO_MUTED(250),

    // Upstream TTS events (300-349)
    SAY_HELLO(300),

    // Downstream TTS events (350-399)
    TTS_SENTENCE_START(350),
    TTS_SENTENCE_END(351),
    TTS_RESPONSE(352),
    TTS_ENDED(359),
    PODCAST_ROUND_START(360),
    PODCAST_ROUND_RESPONSE(361),
    PODCAST_ROUND_END(362),

    // Downstream ASR events (450-499)
    ASR_INFO(450),
    ASR_RESPONSE(451),
    ASR_ENDED(459),

    // Upstream Chat events (500-549)
    CHAT_TTS_TEXT(500),

    // Downstream Chat events (550-599)
    CHAT_RESPONSE(550),
    CHAT_ENDED(559),

    // Subtitle events (650-699)
    SOURCE_SUBTITLE_START(650),
    SOURCE_SUBTITLE_RESPONSE(651),
    SOURCE_SUBTITLE_END(652),
    TRANSLATION_SUBTITLE_START(653),
    TRANSLATION_SUBTITLE_RESPONSE(654),
    TRANSLATION_SUBTITLE_END(655);

    /** Numeric event number carried on the wire. */
    private final int value;

    EventType(int number) {
        value = number;
    }

    /**
     * Resolves an event number to its constant, scanning in declaration order so that aliased
     * numbers map to the first constant declared with that number.
     *
     * @param value event number to resolve
     * @return the first constant carrying that number
     * @throws IllegalArgumentException if no constant carries that number
     */
    public static EventType fromValue(int value) {
        EventType match = null;
        for (EventType candidate : values()) {
            if (candidate.value != value) {
                continue;
            }
            match = candidate;
            break;
        }
        if (match == null) {
            throw new IllegalArgumentException("Unknown EventType value: " + value);
        }
        return match;
    }
}

View File

@@ -0,0 +1,27 @@
package com.speech.protocol;
import lombok.Getter;
/**
 * Header-size codes. The code is the header length expressed in 4-byte units, so
 * {@code HeaderSize4} (code 1) denotes a 4-byte header and {@code HeaderSize16} (code 4) a
 * 16-byte header.
 */
@Getter
public enum HeaderSizeBits {
    HeaderSize4((byte) 1),
    HeaderSize8((byte) 2),
    HeaderSize12((byte) 3),
    HeaderSize16((byte) 4);

    /** Numeric code written to / read from the wire. */
    private final byte value;

    HeaderSizeBits(byte code) {
        value = code;
    }

    /**
     * Looks up the constant whose code equals {@code value}.
     *
     * @param value numeric code to resolve
     * @return the matching constant
     * @throws IllegalArgumentException if no constant carries that code
     */
    public static HeaderSizeBits fromValue(int value) {
        return java.util.Arrays.stream(values())
                .filter(candidate -> candidate.value == value)
                .findFirst()
                .orElseThrow(() -> new IllegalArgumentException("Unknown HeaderSizeBits value: " + value));
    }
}

View File

@@ -0,0 +1,220 @@
package com.speech.protocol;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import java.io.ByteArrayOutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
@Slf4j
@Data
public class Message {
private byte version = VersionBits.Version1.getValue();
private byte headerSize = HeaderSizeBits.HeaderSize4.getValue();
private MsgType type;
private MsgTypeFlagBits flag;
private byte serialization = SerializationBits.JSON.getValue();
private byte compression = 0;
private EventType event;
private String sessionId;
private String connectId;
private int sequence;
private int errorCode;
private byte[] payload;
public Message(MsgType type, MsgTypeFlagBits flag) {
this.type = type;
this.flag = flag;
}
public static Message unmarshal(byte[] data) throws Exception {
ByteBuffer buffer = ByteBuffer.wrap(data);
byte type_and_flag = data[1];
MsgType type = MsgType.fromValue((type_and_flag >> 4) & 0x0F);
MsgTypeFlagBits flag = MsgTypeFlagBits.fromValue(type_and_flag & 0x0F);
// Read version and header size
int versionAndHeaderSize = buffer.get();
VersionBits version = VersionBits.fromValue((versionAndHeaderSize >> 4) & 0x0F);
HeaderSizeBits headerSize = HeaderSizeBits.fromValue(versionAndHeaderSize & 0x0F);
// Skip second byte
buffer.get();
// Read serialization and compression method
int serializationCompression = buffer.get();
SerializationBits serialization = SerializationBits.fromValue((serializationCompression >> 4) & 0x0F);
CompressionBits compression = CompressionBits.fromValue(serializationCompression & 0x0F);
// Skip padding bytes
int headerSizeInt = 4 * (int) headerSize.getValue();
int paddingSize = headerSizeInt - 3;
while (paddingSize > 0) {
buffer.get();
paddingSize -= 1;
}
Message message = new Message(type, flag);
message.setVersion(version.getValue());
message.setHeaderSize(headerSize.getValue());
message.setSerialization(serialization.getValue());
message.setCompression(compression.getValue());
// Read sequence if present
if (flag == MsgTypeFlagBits.POSITIVE_SEQ || flag == MsgTypeFlagBits.NEGATIVE_SEQ) {
// Read 4 bytes from ByteBuffer and parse as int (big-endian)
byte[] sequeueBytes = new byte[4];
if (buffer.remaining() >= 4) {
buffer.get(sequeueBytes); // Read 4 bytes into array
ByteBuffer wrapper = ByteBuffer.wrap(sequeueBytes);
wrapper.order(ByteOrder.BIG_ENDIAN); // Set big-endian order
message.setSequence(wrapper.getInt());
}
}
// Read event if present
if (flag == MsgTypeFlagBits.WITH_EVENT) {
// Read 4 bytes from ByteBuffer and parse as int (big-endian)
byte[] eventBytes = new byte[4];
if (buffer.remaining() >= 4) {
buffer.get(eventBytes); // Read 4 bytes into array
ByteBuffer wrapper = ByteBuffer.wrap(eventBytes);
wrapper.order(ByteOrder.BIG_ENDIAN); // Set big-endian order
message.setEvent(EventType.fromValue(wrapper.getInt()));
}
if (type != MsgType.ERROR && !(message.event == EventType.START_CONNECTION
|| message.event == EventType.FINISH_CONNECTION ||
message.event == EventType.CONNECTION_STARTED
|| message.event == EventType.CONNECTION_FAILED ||
message.event == EventType.CONNECTION_FINISHED)) {
// Read sessionId if present
int sessionIdLength = buffer.getInt();
if (sessionIdLength > 0) {
byte[] sessionIdBytes = new byte[sessionIdLength];
buffer.get(sessionIdBytes);
message.setSessionId(new String(sessionIdBytes, StandardCharsets.UTF_8));
}
}
if (message.event == EventType.CONNECTION_STARTED || message.event == EventType.CONNECTION_FAILED
|| message.event == EventType.CONNECTION_FINISHED) {
// Read connectId if present
int connectIdLength = buffer.getInt();
if (connectIdLength > 0) {
byte[] connectIdBytes = new byte[connectIdLength];
buffer.get(connectIdBytes);
message.setConnectId(new String(connectIdBytes, StandardCharsets.UTF_8));
}
}
}
// Read errorCode if present
if (type == MsgType.ERROR) {
// Read 4 bytes from ByteBuffer and parse as int (big-endian)
byte[] errorCodeBytes = new byte[4];
if (buffer.remaining() >= 4) {
buffer.get(errorCodeBytes); // Read 4 bytes into array
ByteBuffer wrapper = ByteBuffer.wrap(errorCodeBytes);
wrapper.order(ByteOrder.BIG_ENDIAN); // Set big-endian order
message.setErrorCode(wrapper.getInt());
}
}
// Read remaining bytes as payload
if (buffer.remaining() > 0) {
// 4 bytes length
int payloadLength = buffer.getInt();
if (payloadLength > 0) {
byte[] payloadBytes = new byte[payloadLength];
buffer.get(payloadBytes);
message.setPayload(payloadBytes);
}
}
return message;
}
public byte[] marshal() throws Exception {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
// Write header
buffer.write((version & 0x0F) << 4 | (headerSize & 0x0F));
buffer.write((type.getValue() & 0x0F) << 4 | (flag.getValue() & 0x0F));
buffer.write((serialization & 0x0F) << 4 | (compression & 0x0F));
int headerSizeInt = 4 * (int) headerSize;
int padding = headerSizeInt - buffer.size();
while (padding > 0) {
buffer.write(0);
padding -= 1;
}
// Write event if present
if (event != null) {
byte[] eventBytes = ByteBuffer.allocate(4).putInt(event.getValue()).array();
buffer.write(eventBytes);
}
// Write sessionId if present
if (sessionId != null) {
byte[] sessionIdBytes = sessionId.getBytes(StandardCharsets.UTF_8);
buffer.write(ByteBuffer.allocate(4).putInt(sessionIdBytes.length).array());
buffer.write(sessionIdBytes);
}
// Write connectId if present
if (connectId != null) {
byte[] connectIdBytes = connectId.getBytes(StandardCharsets.UTF_8);
buffer.write(ByteBuffer.allocate(4).putInt(connectIdBytes.length).array());
buffer.write(connectIdBytes);
}
// Write sequence if present
if (sequence != 0) {
buffer.write(ByteBuffer.allocate(4).putInt(sequence).array());
}
// Write errorCode if present
if (errorCode != 0) {
buffer.write(ByteBuffer.allocate(4).putInt(errorCode).array());
}
// Write payload if present
if (payload != null && payload.length > 0) {
buffer.write(ByteBuffer.allocate(4).putInt(payload.length).array());
buffer.write(payload);
}
return buffer.toByteArray();
}
@Override
public String toString() {
switch (this.type) {
case AUDIO_ONLY_SERVER:
case AUDIO_ONLY_CLIENT:
if (this.flag == MsgTypeFlagBits.POSITIVE_SEQ || this.flag == MsgTypeFlagBits.NEGATIVE_SEQ) {
return String.format("MsgType: %s, EventType: %s, Sequence: %d, PayloadSize: %d", this.type, this.event, this.sequence,
this.payload != null ? this.payload.length : 0);
}
return String.format("MsgType: %s, EventType: %s, PayloadSize: %d", this.type, this.event,
this.payload != null ? this.payload.length : 0);
case ERROR:
return String.format("MsgType: %s, EventType: %s, ErrorCode: %d, Payload: %s", this.type, this.event, this.errorCode,
this.payload != null ? new String(this.payload) : "null");
default:
if (this.flag == MsgTypeFlagBits.POSITIVE_SEQ || this.flag == MsgTypeFlagBits.NEGATIVE_SEQ) {
return String.format("MsgType: %s, EventType: %s, Sequence: %d, Payload: %s",
this.type, this.event, this.sequence,
this.payload != null ? new String(this.payload) : "null");
}
return String.format("MsgType: %s, EventType: %s, Payload: %s", this.type, this.event,
this.payload != null ? new String(this.payload) : "null");
}
}
}

View File

@@ -0,0 +1,29 @@
package com.speech.protocol;
import lombok.Getter;
/**
 * Message-type codes stored in the high four bits of the second header byte of a protocol
 * {@code Message}.
 */
@Getter
public enum MsgType {
    INVALID((byte) 0),
    FULL_CLIENT_REQUEST((byte) 0b0001),
    AUDIO_ONLY_CLIENT((byte) 0b0010),
    FULL_SERVER_RESPONSE((byte) 0b1001),
    AUDIO_ONLY_SERVER((byte) 0b1011),
    FRONT_END_RESULT_SERVER((byte) 0b1100),
    ERROR((byte) 0b1111);

    /** Numeric code written to / read from the wire. */
    private final byte value;

    MsgType(byte code) {
        this.value = code;
    }

    /**
     * Looks up the constant whose code equals {@code value}.
     *
     * @param value numeric code to resolve
     * @return the matching constant
     * @throws IllegalArgumentException if no constant carries that code
     */
    public static MsgType fromValue(int value) {
        switch (value) {
            case 0:
                return INVALID;
            case 0b0001:
                return FULL_CLIENT_REQUEST;
            case 0b0010:
                return AUDIO_ONLY_CLIENT;
            case 0b1001:
                return FULL_SERVER_RESPONSE;
            case 0b1011:
                return AUDIO_ONLY_SERVER;
            case 0b1100:
                return FRONT_END_RESULT_SERVER;
            case 0b1111:
                return ERROR;
            default:
                throw new IllegalArgumentException("Unknown MsgType value: " + value);
        }
    }
}

View File

@@ -0,0 +1,27 @@
package com.speech.protocol;
import lombok.Getter;
/**
 * Flag codes stored in the low four bits of the second header byte of a protocol
 * {@code Message}.
 */
@Getter
public enum MsgTypeFlagBits {
    /** Non-terminating packet without sequence number. */
    NO_SEQ((byte) 0),
    /** Non-terminating packet with positive sequence number. */
    POSITIVE_SEQ((byte) 0b001),
    /** Terminating packet without sequence number. */
    LAST_NO_SEQ((byte) 0b010),
    /** Terminating packet with negative sequence number. */
    NEGATIVE_SEQ((byte) 0b011),
    /** Packet containing event number. */
    WITH_EVENT((byte) 0b100);

    /** Numeric code written to / read from the wire. */
    private final byte value;

    MsgTypeFlagBits(byte code) {
        this.value = code;
    }

    /**
     * Looks up the constant whose code equals {@code value}.
     *
     * @param value numeric code to resolve
     * @return the matching constant
     * @throws IllegalArgumentException if no constant carries that code
     */
    public static MsgTypeFlagBits fromValue(int value) {
        MsgTypeFlagBits[] all = values();
        int index = 0;
        while (index < all.length) {
            if (all[index].value == value) {
                return all[index];
            }
            index++;
        }
        throw new IllegalArgumentException("Unknown MsgTypeFlagBits value: " + value);
    }
}

View File

@@ -0,0 +1,27 @@
package com.speech.protocol;
import lombok.Getter;
/**
 * Serialization codes stored in the high four bits of the third header byte of a protocol
 * {@code Message}.
 */
@Getter
public enum SerializationBits {
    Raw((byte) 0x0),
    JSON((byte) 0x1),
    Thrift((byte) 0x3),
    Custom((byte) 0xF);

    /** Numeric code written to / read from the wire. */
    private final byte value;

    SerializationBits(byte code) {
        value = code;
    }

    /**
     * Looks up the constant whose code equals {@code value}.
     *
     * @param value numeric code to resolve
     * @return the matching constant
     * @throws IllegalArgumentException if no constant carries that code
     */
    public static SerializationBits fromValue(int value) {
        java.util.Optional<SerializationBits> match = java.util.Arrays.stream(values())
                .filter(candidate -> candidate.value == value)
                .findFirst();
        if (match.isPresent()) {
            return match.get();
        }
        throw new IllegalArgumentException("Unknown SerializationBits value: " + value);
    }
}

View File

@@ -0,0 +1,115 @@
package com.speech.protocol;
import lombok.extern.slf4j.Slf4j;
import org.java_websocket.client.WebSocketClient;
import org.java_websocket.handshake.ServerHandshake;
import java.net.URI;
import java.nio.ByteBuffer;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
@Slf4j
public class SpeechWebSocketClient extends WebSocketClient {
private final BlockingQueue<Message> messageQueue = new LinkedBlockingQueue<>();
public SpeechWebSocketClient(URI serverUri, Map<String, String> headers) {
super(serverUri, headers);
}
@Override
public void onOpen(ServerHandshake handshakedata) {
log.info("WebSocket connection established, Logid: {}", handshakedata.getFieldValue("x-tt-logid"));
}
@Override
public void onMessage(String message) {
log.warn("Received unexpected text message: {}", message);
}
@Override
public void onMessage(ByteBuffer bytes) {
try {
Message message = Message.unmarshal(bytes.array());
messageQueue.put(message);
} catch (Exception e) {
log.error("Failed to parse message", e);
}
}
@Override
public void onClose(int code, String reason, boolean remote) {
log.info("WebSocket connection closed: code={}, reason={}, remote={}", code, reason, remote);
}
@Override
public void onError(Exception ex) {
log.error("WebSocket error", ex);
}
public void sendStartConnection() throws Exception {
Message message = new Message(MsgType.FULL_CLIENT_REQUEST, MsgTypeFlagBits.WITH_EVENT);
message.setEvent(EventType.START_CONNECTION);
message.setPayload("{}".getBytes());
sendMessage(message);
}
public void sendFinishConnection() throws Exception {
Message message = new Message(MsgType.FULL_CLIENT_REQUEST, MsgTypeFlagBits.WITH_EVENT);
message.setEvent(EventType.FINISH_CONNECTION);
sendMessage(message);
}
public void sendStartSession(byte[] payload, String sessionId) throws Exception {
Message message = new Message(MsgType.FULL_CLIENT_REQUEST, MsgTypeFlagBits.WITH_EVENT);
message.setEvent(EventType.START_SESSION);
message.setSessionId(sessionId);
message.setPayload(payload);
sendMessage(message);
}
public void sendFinishSession(String sessionId) throws Exception {
Message message = new Message(MsgType.FULL_CLIENT_REQUEST, MsgTypeFlagBits.WITH_EVENT);
message.setEvent(EventType.FINISH_SESSION);
message.setSessionId(sessionId);
message.setPayload("{}".getBytes());
sendMessage(message);
}
public void sendTaskRequest(byte[] payload, String sessionId) throws Exception {
Message message = new Message(MsgType.FULL_CLIENT_REQUEST, MsgTypeFlagBits.WITH_EVENT);
message.setEvent(EventType.TASK_REQUEST);
message.setSessionId(sessionId);
message.setPayload(payload);
sendMessage(message);
}
public void sendFullClientMessage(byte[] payload) throws Exception {
Message message = new Message(MsgType.FULL_CLIENT_REQUEST, MsgTypeFlagBits.NO_SEQ);
message.setPayload(payload);
sendMessage(message);
}
public void sendMessage(Message message) throws Exception {
log.info("Send: {}", message);
send(message.marshal());
}
public Message receiveMessage() throws InterruptedException {
Message message = messageQueue.take();
log.info("Receive: {}", message);
return message;
}
public Message waitForMessage(MsgType type, EventType event) throws InterruptedException {
while (true) {
Message message = receiveMessage();
if (message.getType() == type && message.getEvent() == event) {
return message;
} else {
throw new RuntimeException("Unexpected message: " + message);
}
}
}
}

View File

@@ -0,0 +1,27 @@
package com.speech.protocol;
import lombok.Getter;
/**
 * Protocol version codes stored in the high four bits of the first header byte of a protocol
 * {@code Message}.
 */
@Getter
public enum VersionBits {
    Version1((byte) 1),
    Version2((byte) 2),
    Version3((byte) 3),
    Version4((byte) 4);

    /** Numeric code written to / read from the wire. */
    private final byte value;

    VersionBits(byte code) {
        value = code;
    }

    /**
     * Looks up the constant whose code equals {@code value}.
     *
     * @param value numeric code to resolve
     * @return the matching constant
     * @throws IllegalArgumentException if no constant carries that code
     */
    public static VersionBits fromValue(int value) {
        VersionBits[] known = values();
        for (int i = 0; i < known.length; i++) {
            if (known[i].value != value) {
                continue;
            }
            return known[i];
        }
        throw new IllegalArgumentException("Unknown VersionBits value: " + value);
    }
}

View File

@@ -0,0 +1,4 @@
package org.ruoyi.aihuman.service;
/**
 * Empty placeholder interface: declares no methods yet.
 *
 * <p>NOTE(review): presumably the service contract for the Volcengine speech integration —
 * confirm the intended operations.
 */
public interface AihumanVolcengineService {
}

View File

@@ -0,0 +1,4 @@
package org.ruoyi.aihuman.service.impl;
/**
 * Empty placeholder class: declares no fields or methods yet.
 *
 * <p>NOTE(review): despite the {@code Impl} suffix this class does not implement
 * {@code AihumanVolcengineService} and carries no framework annotations — confirm whether
 * that is intentional.
 */
public class AihumanVolcengineServiceImpl {
}

View File

@@ -0,0 +1,160 @@
package com.speech.volcengine;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.speech.protocol.EventType;
import com.speech.protocol.Message;
import com.speech.protocol.MsgType;
import com.speech.protocol.SpeechWebSocketClient;
import lombok.extern.slf4j.Slf4j;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.net.URI;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
/**
 * Command-line demo of bidirectional streaming text-to-speech over WebSocket.
 *
 * <p>The input text is split into sentences; each sentence is synthesized in its own session
 * and the resulting audio is written to {@code <voice>_session_<index>.<encoding>} in the
 * working directory.
 *
 * <p>Configuration is read from JVM system properties: {@code appId} and {@code accessToken}
 * (both required), plus optional {@code resourceId}, {@code voice}, {@code text} and
 * {@code encoding}.
 */
@Slf4j
public class Bidirection {
    private static final String ENDPOINT = "wss://openspeech.bytedance.com/api/v3/tts/bidirection";
    private static final String NAMESPACE = "BidirectionalTTS";
    /** Sentence terminator used to cut the input text into per-session chunks. */
    private static final String SENTENCE_DELIMITER = "。";
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /**
     * Get resource ID based on voice type.
     *
     * @param voice voice type string; must not be {@code null}
     * @return {@code "volc.megatts.default"} for voices starting with {@code "S_"}, otherwise
     *     {@code "volc.service_type.10029"}
     */
    public static String voiceToResourceId(String voice) {
        // Map different voice types to resource IDs based on actual needs
        return voice.startsWith("S_") ? "volc.megatts.default" : "volc.service_type.10029";
    }

    /**
     * Runs the demo.
     *
     * @param args unused; configuration comes from system properties
     * @throws IllegalArgumentException if {@code appId} or {@code accessToken} is not set
     * @throws IllegalStateException if the WebSocket connection cannot be established
     * @throws RuntimeException if the server sends an unexpected message, a session fails, or
     *     no audio is received at all
     */
    public static void main(String[] args) throws Exception {
        // Credentials must be supplied by the caller; never ship real defaults in source.
        String appId = System.getProperty("appId", "");
        String accessToken = System.getProperty("accessToken", "");
        String resourceId = System.getProperty("resourceId", "seed-tts-2.0");
        String voice = System.getProperty("voice", "zh_female_vv_uranus_bigtts");
        String text = System.getProperty("text", "你好呀!如果你有关于老婆相关的问题,比如怎么让她开心、怎么照顾她等,都可以跟我说哦,我会根据【马斯克·陈】提供的关爱老婆百事通里的信息给你分析和建议哒。");
        String encoding = System.getProperty("encoding", "mp3");
        if (appId.isEmpty() || accessToken.isEmpty()) {
            throw new IllegalArgumentException("Please set appId and accessToken system properties");
        }

        Map<String, String> headers = Map.of(
                "X-Api-App-Key", appId,
                "X-Api-Access-Key", accessToken,
                "X-Api-Resource-Id", resourceId.isEmpty() ? voiceToResourceId(voice) : resourceId,
                "X-Api-Connect-Id", UUID.randomUUID().toString());

        SpeechWebSocketClient client = new SpeechWebSocketClient(new URI(ENDPOINT), headers);
        try {
            if (!client.connectBlocking()) {
                throw new IllegalStateException("Failed to connect to " + ENDPOINT);
            }

            Map<String, Object> user = Map.of("uid", UUID.randomUUID().toString());
            Map<String, Object> reqParams = Map.of(
                    "speaker", voice,
                    "audio_params", Map.of(
                            "format", encoding,
                            "sample_rate", 24000,
                            "enable_timestamp", true),
                    // additions requires a JSON string
                    "additions", OBJECT_MAPPER.writeValueAsString(Map.of(
                            "disable_markdown_filter", false)));

            client.sendStartConnection();
            client.waitForMessage(MsgType.FULL_SERVER_RESPONSE, EventType.CONNECTION_STARTED);

            // One session per sentence.
            String[] sentences = text.split(SENTENCE_DELIMITER);
            boolean audioReceived = false;
            for (int i = 0; i < sentences.length; i++) {
                if (sentences[i].trim().isEmpty()) {
                    continue;
                }
                byte[] audio = synthesizeSentence(client, user, reqParams, sentences[i]);
                if (audio.length > 0) {
                    audioReceived = true;
                    String fileName = String.format("%s_session_%d.%s", voice, i, encoding);
                    Files.write(new File(fileName).toPath(), audio);
                    log.info("Audio saved to file: {}", fileName);
                }
            }
            if (!audioReceived) {
                throw new RuntimeException("No audio data received");
            }

            client.sendFinishConnection();
        } finally {
            client.closeBlocking();
        }
    }

    /**
     * Synthesizes one sentence in its own session and returns the concatenated audio bytes.
     *
     * <p>The sentence is streamed one Unicode code point per task request, then the session is
     * finished and audio frames are collected until {@code SESSION_FINISHED} arrives.
     */
    private static byte[] synthesizeSentence(SpeechWebSocketClient client, Map<String, Object> user,
            Map<String, Object> reqParams, String sentence) throws Exception {
        String sessionId = UUID.randomUUID().toString();

        Map<String, Object> startRequest = Map.of(
                "user", user,
                "namespace", NAMESPACE,
                "req_params", reqParams,
                "event", EventType.START_SESSION.getValue());
        client.sendStartSession(OBJECT_MAPPER.writeValueAsBytes(startRequest), sessionId);
        client.waitForMessage(MsgType.FULL_SERVER_RESPONSE, EventType.SESSION_STARTED);

        // Iterate code points, not chars, so surrogate pairs are never split across requests.
        for (int codePoint : sentence.codePoints().toArray()) {
            Map<String, Object> paramsWithText = new HashMap<>(reqParams);
            paramsWithText.put("text", new String(Character.toChars(codePoint)));
            Map<String, Object> taskRequest = Map.of(
                    "user", user,
                    "namespace", NAMESPACE,
                    "req_params", paramsWithText,
                    "event", EventType.TASK_REQUEST.getValue());
            client.sendTaskRequest(OBJECT_MAPPER.writeValueAsBytes(taskRequest), sessionId);
        }
        client.sendFinishSession(sessionId);

        ByteArrayOutputStream audio = new ByteArrayOutputStream();
        while (true) {
            Message msg = client.receiveMessage();
            switch (msg.getType()) {
                case FULL_SERVER_RESPONSE:
                    break;
                case AUDIO_ONLY_SERVER: {
                    byte[] chunk = msg.getPayload();
                    if (chunk != null) {
                        audio.write(chunk, 0, chunk.length);
                    }
                    break;
                }
                default:
                    throw new RuntimeException("Unexpected message: " + msg);
            }
            // A failed session never reports SESSION_FINISHED; fail fast instead of blocking.
            if (msg.getEvent() == EventType.SESSION_FAILED) {
                throw new RuntimeException("Session failed: " + msg);
            }
            if (msg.getEvent() == EventType.SESSION_FINISHED) {
                return audio.toByteArray();
            }
        }
    }
}