logo
On this page

Drive Digital Human Speech via WebSocket


Introduction

This document describes how to use WebSocket to transmit PCM audio data to drive digital human speech.

Prerequisites

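Before using WebSocket driving, ensure that you have:

  • Your AppID and ServerSecret, which are used to sign the DriveByWsStream request.
  • The task ID of a digital human video stream task to drive.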

Implementation Process

1 Get information for WebSocket driving

Call the DriveByWsStream interface. It returns the information needed for WebSocket driving: the WebSocket address (which includes authentication information) and the driving task ID (DriveId).

2 Establish WebSocket Connection

Note
  • For the same digital human video stream task, please do not establish multiple connections simultaneously and send data in parallel on different connections.
  • When establishing a WebSocket connection, please skip certificate verification.

Establish a WebSocket connection through the WebSocket address (including authentication information).

3 Send a Start Command

After the WebSocket connection is established, send the start driving command through this connection. The protocol is as follows:

Request Parameters

| Parameter | Type | Required | Description |
| --- | --- | --- | --- |
| Action | String | Yes | Fixed value: Start |
| Payload | Object | Yes | Request body. |
| └ DriveId | String | Yes | Obtained from the response parameters of the Get WebSocket Driving Information interface. Note: Please call the `DriveByWsStream` interface to get a new `DriveId` every time you drive the digital human by WebSocket. |
| └ SampleRate | Number | No | Audio sampling rate. The following two sampling rates are currently supported: 16000 and 24000. If omitted, the default value is 16000. |

Request Example

{
    "Action": "Start",
    "Payload": {
        "DriveId": "xxxxxxxxxxx",
        "SampleRate": 16000
    }
}

4 Send PCM Audio Data

Send the raw PCM audio data to the digital human API platform service as binary messages over the WebSocket connection.

5 Send a Stop Command

After all PCM audio data has been sent, send the Stop command promptly; otherwise the digital human may stutter. The protocol is as follows:

Note

If the Stop command is not sent promptly, the digital human backend assumes that more PCM data is still to come and keeps waiting. The data it has already received is then insufficient to complete inference, which causes the digital human to stutter.

Request Parameters

| Parameter | Type | Required | Description |
| --- | --- | --- | --- |
| Action | String | Yes | Fixed value: Stop |
| Payload | Object | Yes | Request body. |
| └ DriveId | String | Yes | Obtained from the response parameters of the Get WebSocket Driving Information interface. Note: Please call the `DriveByWsStream` interface to get a new `DriveId` every time you drive the digital human by WebSocket. |

Request Example

{
    "Action": "Stop",
    "Payload": {
        "DriveId": "xxxxxxxxxxx"
    }
}

6 Disconnect WebSocket Connection

If there are no further WebSocket driving requests, close the connection promptly.

Sample Code

The following is the sample code for driving the digital human to speak by transmitting PCM audio data through WebSocket. You can refer to the sample code to implement your own business logic.

package main

import (
	"bytes"
	"crypto/md5"
	"crypto/tls"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"strconv"
	"time"

	"github.com/gorilla/websocket"
)

// Configuration for the sample program. Replace the placeholder values with
// your own before running.
//
// AppID and ServerSecret are used to sign the DriveByWsStream request, TaskID
// is sent as the TaskId field of the request body, APIHost is the host the
// HTTPS request is sent to, and PCMFilePath is the audio file that is read and
// streamed over the WebSocket connection.
const (
	AppID        = 123456               // Replace with your AppID
	ServerSecret = "your_server_secret" // Replace with your ServerSecret
	TaskID       = "your_task_id"       // Replace with your digital human video stream task ID
	APIHost      = "aigc-digitalhuman-api.zegotech.cn"
	PCMFilePath  = "./audio.pcm" // Replace with your PCM audio file path
)

// DriveByWsStreamResponse is the JSON response of the DriveByWsStream
// interface, which returns the information needed for WebSocket driving.
//
// A Code of 0 is treated as success by this sample; any other value is
// reported as an API error together with Message. Data.DriveID is the driving
// task ID placed in the Start and Stop commands, and Data.WssAddress is the
// WebSocket address (including authentication information) to connect to.
type DriveByWsStreamResponse struct {
	Code      int    `json:"Code"`
	Message   string `json:"Message"`
	RequestID string `json:"RequestId"`
	Data      struct {
		DriveID    string `json:"DriveId"`
		WssAddress string `json:"WssAddress"`
	} `json:"Data"`
}

// StartCommand is the JSON message sent over the WebSocket connection to start
// driving, before any PCM audio data is sent.
//
// Action carries the fixed value "Start". Payload.DriveID is the DriveId
// returned by DriveByWsStream, and Payload.SampleRate is the audio sampling
// rate (16000 or 24000 are supported by the service).
//
// NOTE: the Payload struct shape, including its tags, is repeated in the
// composite literal that builds this command; keep the two in sync.
type StartCommand struct {
	Action  string `json:"Action"`
	Payload struct {
		DriveID    string `json:"DriveId"`
		SampleRate int    `json:"SampleRate"`
	} `json:"Payload"`
}

// StopCommand is the JSON message sent over the WebSocket connection after the
// last PCM audio data, telling the service that no more audio will follow.
//
// Action carries the fixed value "Stop" and Payload.DriveID is the same
// DriveId that was used in the Start command.
//
// NOTE: the Payload struct shape, including its tags, is repeated in the
// composite literal that builds this command; keep the two in sync.
type StopCommand struct {
	Action  string `json:"Action"`
	Payload struct {
		DriveID string `json:"DriveId"`
	} `json:"Payload"`
}

// generateSignature returns the request signature: the lowercase hexadecimal
// MD5 digest of the decimal appId, signatureNonce, serverSecret and the decimal
// timestamp concatenated in that order with no separators.
func generateSignature(appId uint32, signatureNonce string, serverSecret string, timestamp int64) (Signature string) {
	payload := fmt.Sprintf("%d%s%s%d", appId, signatureNonce, serverSecret, timestamp)
	digest := md5.Sum([]byte(payload))
	Signature = hex.EncodeToString(digest[:])
	return Signature
}

// getDriveByWsStreamInfo calls the DriveByWsStream interface and returns the
// information needed for WebSocket driving (the WebSocket address and the
// DriveId) for the task identified by TaskID.
//
// The request is a POST to https://<APIHost>/ carrying the common parameters
// (Action, AppId, SignatureNonce, SignatureVersion, Timestamp, Signature) in
// the query string and {"TaskId": TaskID} as the JSON body.
//
// An error is returned when the request cannot be built or sent, when the
// server answers with a non-2xx HTTP status, when the response body is not
// valid JSON, or when the API reports a non-zero Code.
func getDriveByWsStreamInfo() (*DriveByWsStreamResponse, error) {
	// Construct request parameters.
	// Use a time-seeded source: before Go 1.20 the global math/rand source is
	// seeded deterministically, which would repeat the same nonce on every run.
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	signatureNonce := strconv.FormatInt(rng.Int63(), 10)
	timestamp := time.Now().Unix()

	query := url.Values{}
	query.Set("Action", "DriveByWsStream")
	query.Set("AppId", strconv.FormatInt(AppID, 10))
	query.Set("SignatureNonce", signatureNonce)
	query.Set("SignatureVersion", "2.0")
	query.Set("Timestamp", strconv.FormatInt(timestamp, 10))
	// The signature covers the same nonce and timestamp sent in the query.
	query.Set("Signature", generateSignature(AppID, signatureNonce, ServerSecret, timestamp))

	// Construct request URL
	reqURL := url.URL{
		Scheme:   "https",
		Host:     APIHost,
		Path:     "/",
		RawQuery: query.Encode(),
	}

	// Construct request body
	reqBodyBytes, err := json.Marshal(map[string]string{"TaskId": TaskID})
	if err != nil {
		return nil, fmt.Errorf("encoding DriveByWsStream request body: %w", err)
	}

	// Send HTTP request
	req, err := http.NewRequest("POST", reqURL.String(), bytes.NewBuffer(reqBodyBytes))
	if err != nil {
		return nil, fmt.Errorf("building DriveByWsStream request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	// Bound the whole exchange so an unresponsive server cannot hang the caller.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("sending DriveByWsStream request: %w", err)
	}
	defer resp.Body.Close()

	// Parse response
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading DriveByWsStream response: %w", err)
	}

	// Surface transport-level failures (proxies, gateways) with the raw body
	// instead of a confusing JSON decoding error.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return nil, fmt.Errorf("DriveByWsStream: unexpected HTTP status %s: %s", resp.Status, body)
	}

	var response DriveByWsStreamResponse
	if err := json.Unmarshal(body, &response); err != nil {
		return nil, fmt.Errorf("decoding DriveByWsStream response: %w", err)
	}

	if response.Code != 0 {
		return nil, fmt.Errorf("API error: %d - %s", response.Code, response.Message)
	}

	return &response, nil
}

// driveDigitalHumanWithWebSocket drives the digital human to speak by
// streaming the PCM file at pcmFilePath over the WebSocket at wssAddress.
//
// It connects, sends the Start command for driveID with a 16000 sampling rate,
// sends the file as binary messages of up to 4 KB each (pausing 100 ms after
// every message), and finally sends the Stop command. The connection is closed
// when the function returns.
//
// An error is returned if the file cannot be opened or read, if the connection
// cannot be established, or if any message cannot be serialized or sent. Once
// the Start command has been sent, every failure path still attempts to send
// Stop so the service is not left waiting for more audio.
func driveDigitalHumanWithWebSocket(wssAddress, driveID string, pcmFilePath string) error {
	// Open the audio first: a bad path then fails before a Start command is
	// sent, so the service is never started without a matching Stop.
	pcmFile, err := os.Open(pcmFilePath)
	if err != nil {
		return fmt.Errorf("failed to open the PCM file: %w", err)
	}
	defer pcmFile.Close()

	// 1. Establish a WebSocket connection
	// websocket.DefaultDialer is a shared pointer; work on a copy so the TLS
	// override below stays local to this connection.
	dialer := *websocket.DefaultDialer
	dialer.TLSClientConfig = &tls.Config{
		InsecureSkipVerify: true, // Ignore certificate verification, as the service documentation instructs
	}
	c, _, err := dialer.Dial(wssAddress, nil)
	if err != nil {
		return fmt.Errorf("failed to connect to WebSocket: %w", err)
	}
	defer c.Close()

	// 2. Send the Start command
	startCmd := StartCommand{
		Action: "Start",
		Payload: struct {
			DriveID    string `json:"DriveId"`
			SampleRate int    `json:"SampleRate"`
		}{
			DriveID:    driveID,
			SampleRate: 16000, // Use 16000Hz sampling rate
		},
	}

	startCmdBytes, err := json.Marshal(startCmd)
	if err != nil {
		return fmt.Errorf("failed to serialize the Start command: %w", err)
	}

	if err := c.WriteMessage(websocket.TextMessage, startCmdBytes); err != nil {
		return fmt.Errorf("failed to send the Start command: %w", err)
	}

	// sendStop serializes and sends the Stop command for driveID.
	sendStop := func() error {
		stopCmd := StopCommand{
			Action: "Stop",
			Payload: struct {
				DriveID string `json:"DriveId"`
			}{
				DriveID: driveID,
			},
		}

		stopCmdBytes, err := json.Marshal(stopCmd)
		if err != nil {
			return fmt.Errorf("failed to serialize the Stop command: %w", err)
		}

		if err := c.WriteMessage(websocket.TextMessage, stopCmdBytes); err != nil {
			return fmt.Errorf("failed to send the Stop command: %w", err)
		}
		return nil
	}

	// 3. Read the PCM file and send audio data
	// Read and send 4KB of PCM data each time
	buffer := make([]byte, 4096)
	for {
		n, readErr := pcmFile.Read(buffer)

		// A reader may return data together with an error, so send whatever
		// was read before looking at readErr.
		if n > 0 {
			// Send PCM binary data
			if err := c.WriteMessage(websocket.BinaryMessage, buffer[:n]); err != nil {
				_ = sendStop() // best effort; the send failure is the error worth reporting
				return fmt.Errorf("failed to send PCM data: %w", err)
			}

			// Control the sending rate to avoid sending too fast
			time.Sleep(100 * time.Millisecond)
		}

		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			_ = sendStop() // best effort; the read failure is the error worth reporting
			return fmt.Errorf("failed to read the PCM file: %w", readErr)
		}
	}

	// 4. Send the Stop command
	return sendStop()
}

// main runs the sample end to end: it fetches the WebSocket driving
// information and then streams the PCM file at PCMFilePath to the digital
// human. Progress is written to standard output; on failure the error is
// written to standard error and the process exits with status 1.
func main() {
	// 1. Get information for WebSocket driving
	fmt.Println("Getting WebSocket driving information...")
	response, err := getDriveByWsStreamInfo()
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to get WebSocket driving information: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Successfully got WebSocket driving information:")
	fmt.Printf("DriveId: %s\n", response.Data.DriveID)
	// The WebSocket address embeds authentication information, so it is
	// deliberately not echoed to the console.
	fmt.Println("WssAddress: (received; not printed because it contains authentication information)")

	// 2. Drive the digital human to speak via WebSocket
	fmt.Println("Driving the digital human to speak via WebSocket...")
	err = driveDigitalHumanWithWebSocket(response.Data.WssAddress, response.Data.DriveID, PCMFilePath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to drive the digital human to speak via WebSocket: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Successfully drove the digital human to speak via WebSocket!")
}

Previous

Synthesize Real-Time Streaming Digital Human Video

Next

API Overview