在VR場景中我想把語音轉為文字,使用起來更高大上一點兒,于是,我翻遍了目前用得比較多的幾種SDK:科大訊飛,百度語音,微軟TTS,親加通訊。
其中,親加通訊和百度的SDK主要是面對移動端,TTS主要面對Windows,科大訊飛移動端桌面端都有。
我是想用到Oculus中的,所以要選用Windows,然后困難就來了。
訊飛的桌面SDK是用C語言寫的,想用到Unity里面對于我這種一點兒C都不會的太難了,TTS沒怎么研究,但是感覺蠻復雜的,然后經過我百般摸索,發現百度語音不光有移動端,還有REST API,使用HTTP請求來調用,不限平臺!
于是,我就決定采用百度的REST方式來實現語音轉為文字。
這個是使用文檔,自己看一下,下面我們直接上代碼。
在Unity中新建一個腳本:
// Request parameters and state for the Baidu speech-recognition REST calls.
private string token; // access_token; assigned by GetToken from the token response
private string cuid = "你自己隨便寫一個(gè)用戶標(biāo)識(shí)"; // user identifier sent with each request (placeholder: replace with your own value)
private string format = "wav"; // audio format declared in the request; NOTE(review): GetClipData emits headerless 16-bit PCM - confirm the service accepts that under "wav"
private int rate = 8000; // sample rate in Hz; also passed to Microphone.Start
private int channel = 1; // number of audio channels declared in the request
private string speech; // audio data, base64-encoded
private int len; // length of the raw audio in bytes (before base64 encoding)
private string lan = "zh"; // language; NOTE(review): not written into the request built by GetAudioString
private string grant_Type = "client_credentials"; // grant_type field of the token request
private string client_ID = "你的百度appkey"; // Baidu app key (placeholder: replace with your own)
private string client_Secret = "你的百度SecretKey"; // Baidu secret key (placeholder: replace with your own)
private string baiduAPI = "http://vop.baidu.com/server_api"; // speech-recognition endpoint
private string getTokenAPIPath = "https://openapi.baidu.com/oauth/2.0/token"; // token endpoint
private Byte[] clipByte; // raw bytes of the last recording, as returned by GetClipData
/// <summary>
/// The text produced by the most recent successful conversion.
/// </summary>
public static string audioToString;
private AudioSource aud; // AudioSource that holds the recorded clip
private int audioLength;// recording length in whole seconds (set in EndMic)
以上是需要聲明的變量。其中AppKey和SecretKey需要你注冊成為百度的開發者,然后在應用管理中去查看。
繼續代碼:
/// <summary>
/// Requests a Baidu access token (client-credentials grant) and, when one is
/// returned, stores it in <c>token</c> and starts the speech-to-text request.
/// Failures are logged with Debug.LogError and end the coroutine.
/// </summary>
/// <param name="url">URL of the token endpoint.</param>
/// <returns>Coroutine enumerator; run it with StartCoroutine.</returns>
private IEnumerator GetToken(string url)
{
    WWWForm getTForm = new WWWForm();
    getTForm.AddField("grant_type", grant_Type);
    getTForm.AddField("client_id", client_ID);
    getTForm.AddField("client_secret", client_Secret);

    WWW getTW = new WWW(url, getTForm);
    yield return getTW;

    // WWW.error is null OR an empty string when the request succeeded,
    // so a plain "== null" test can misreport a successful request as failed.
    if (!string.IsNullOrEmpty(getTW.error))
    {
        Debug.LogError(getTW.error);
        yield break;
    }

    // Parse defensively: an error payload or a non-JSON body must not throw
    // out of the coroutine. No yield statements are placed inside the try block.
    string accessToken = null;
    try
    {
        JsonData response = JsonMapper.ToObject(getTW.text);
        if (response != null && response.IsObject && ((IDictionary)response).Contains("access_token"))
        {
            accessToken = response["access_token"].ToString();
        }
    }
    catch (JsonException e)
    {
        Debug.LogError("Token response is not valid JSON: " + e.Message);
    }

    if (string.IsNullOrEmpty(accessToken))
    {
        Debug.LogError("Token response did not contain an access_token: " + getTW.text);
        yield break;
    }

    token = accessToken;
    StartCoroutine(GetAudioString(baiduAPI));
}
上面這段代碼是獲取百度的Token,有Token才有權使用API。
然后是發送轉換請求的方法:
/// <summary>
/// Posts the recorded audio (base64 in <c>speech</c>) to the recognition endpoint as a
/// JSON body and stores the first recognised sentence in <c>audioToString</c>.
/// Transport errors, malformed replies and non-success replies are logged with
/// Debug.LogError and leave <c>audioToString</c> unchanged.
/// </summary>
/// <param name="url">URL of the speech-recognition endpoint.</param>
/// <returns>Coroutine enumerator; run it with StartCoroutine.</returns>
private IEnumerator GetAudioString(string url)
{
    // Build the JSON request body; property order matches the original request.
    JsonWriter jw = new JsonWriter();
    jw.WriteObjectStart();
    jw.WritePropertyName("format");
    jw.Write(format);
    jw.WritePropertyName("rate");
    jw.Write(rate);
    jw.WritePropertyName("channel");
    jw.Write(channel);
    jw.WritePropertyName("token");
    jw.Write(token);
    jw.WritePropertyName("cuid");
    jw.Write(cuid);
    jw.WritePropertyName("len");
    jw.Write(len);
    jw.WritePropertyName("speech");
    jw.Write(speech);
    jw.WriteObjectEnd();

    // Encode explicitly as UTF-8 instead of the platform-dependent Encoding.Default.
    WWW getASW = new WWW(url, Encoding.UTF8.GetBytes(jw.ToString()));
    yield return getASW;

    // WWW.error is null OR an empty string when the request succeeded.
    if (!string.IsNullOrEmpty(getASW.error))
    {
        Debug.LogError(getASW.error);
        yield break;
    }

    // Parse outside of any yield so the try/catch is legal inside an iterator.
    JsonData getASWJson = null;
    try
    {
        getASWJson = JsonMapper.ToObject(getASW.text);
    }
    catch (JsonException e)
    {
        Debug.LogError("Speech response is not valid JSON: " + e.Message);
    }

    if (getASWJson == null || !getASWJson.IsObject)
    {
        Debug.LogError("Unexpected speech response: " + getASW.text);
        yield break;
    }

    IDictionary fields = (IDictionary)getASWJson;
    bool succeeded = fields.Contains("err_msg")
        && getASWJson["err_msg"].ToString() == "success.";
    if (!succeeded)
    {
        // Previously a failed recognition was dropped silently; surface it instead.
        Debug.LogError("Speech recognition failed: " + getASW.text);
        yield break;
    }

    if (!fields.Contains("result") || !getASWJson["result"].IsArray || getASWJson["result"].Count == 0)
    {
        Debug.LogError("Speech response contained no result: " + getASW.text);
        yield break;
    }

    string recognised = getASWJson["result"][0].ToString();
    // Strip one trailing full-width comma. EndsWith is safe on an empty string,
    // whereas Substring(Length - 1) throws when the result is empty.
    if (recognised.EndsWith(",", StringComparison.Ordinal))
    {
        recognised = recognised.Substring(0, recognised.Length - 1);
    }

    audioToString = recognised;
    Debug.Log(audioToString);
}
注意,這里不能用WWWForm的AddField方法去上傳參數(因為接口要求請求體是一個JSON字符串),否則會返回錯誤3300,也就是參數錯誤,我就是在這卡住了很長時間。
好了,現在就可以把一段語音轉換成文字然后返回到audioToString這個字符串了。
然后是用Unity錄音,這個腳本在之前的文章中寫過,下面再寫一遍:
/// <summary>
/// Ensures this GameObject has an AudioSource, caches it in <c>aud</c>,
/// and turns off play-on-awake for it.
/// </summary>
private void Awake()
{
    // Reuse an AudioSource that is already attached; otherwise add a new one.
    AudioSource attached = GetComponent<AudioSource>();
    if (attached != null)
    {
        aud = attached;
    }
    else
    {
        aud = gameObject.AddComponent<AudioSource>();
    }

    aud.playOnAwake = false;
}
/// <summary>
/// Starts recording from the default microphone into <c>aud.clip</c>
/// (non-looping, 10 seconds, sampled at <c>rate</c>). Does nothing when no
/// microphone device is present.
/// </summary>
public void StartMic()
{
    bool hasMicrophone = Microphone.devices.Length != 0;
    if (hasMicrophone)
    {
        // Stop any recording already in progress on the default device.
        Microphone.End(null);
        Debug.Log("Start");

        AudioClip recording = Microphone.Start(null, false, 10, rate);
        aud.clip = recording;
    }
}
/// <summary>
/// Stops recording, converts the recorded clip to bytes, fills <c>len</c> and
/// <c>speech</c> from it, and starts the token request that leads to recognition.
/// If no recording data is available the method returns without sending anything.
/// </summary>
public void EndMic()
{
    // Read the position before ending the recording.
    int lastPos = Microphone.GetPosition(null);
    if (Microphone.IsRecording(null))
        audioLength = lastPos / rate; // recorded duration in whole seconds
    else
        audioLength = 10; // not recording any more: use the 10 s clip length requested in StartMic
    Debug.Log("Stop");
    Microphone.End(null);

    clipByte = GetClipData();
    // GetClipData returns null (and logs) when there is no clip, e.g. End was
    // pressed before Start; without this guard clipByte.Length throws.
    if (clipByte == null)
    {
        return;
    }

    len = clipByte.Length;
    speech = Convert.ToBase64String(clipByte);
    StartCoroutine(GetToken(getTokenAPIPath));
}
/// <summary>
/// Converts the samples of <c>aud.clip</c> to 16-bit signed PCM bytes
/// (two bytes per sample, low byte first).
/// </summary>
/// <returns>
/// The PCM bytes, or null (after logging an error) when there is no clip
/// or the clip contains no samples.
/// </returns>
public Byte[] GetClipData()
{
    if (aud.clip == null)
    {
        Debug.LogError("錄音數(shù)據(jù)為空");
        return null;
    }

    float[] samples = new float[aud.clip.samples];
    aud.clip.GetData(samples, 0);

    if (samples.Length == 0)
    {
        Debug.LogError("錄音數(shù)據(jù)為空");
        return null;
    }

    const int rescaleFactor = 32767; // scales a [-1, 1] float sample to the Int16 range
    Byte[] outData = new byte[samples.Length * 2];
    for (int i = 0; i < samples.Length; i++)
    {
        // Clamp first: a sample slightly outside [-1, 1] would otherwise overflow
        // the short cast and wrap to the opposite sign.
        float clamped = Math.Max(-1f, Math.Min(1f, samples[i]));
        short value = (short)(clamped * rescaleFactor);

        // Write little-endian explicitly; this avoids one array allocation per
        // sample and does not depend on the host byte order as BitConverter does.
        outData[i * 2] = (byte)(value & 0xFF);
        outData[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
    }

    return outData;
}
然后再添加一下測試的代碼:
/// <summary>
/// Draws two test buttons: "Start" begins recording, "End" stops it and
/// triggers recognition.
/// </summary>
private void OnGUI()
{
    bool startClicked = GUILayout.Button("Start");
    if (startClicked)
    {
        StartMic();
    }

    bool endClicked = GUILayout.Button("End");
    if (endClicked)
    {
        EndMic();
    }
}
// UI Text that shows the recognition result; its text is written by Update.
public Text debugText;
/// <summary>
/// Copies the latest recognition result into <c>debugText</c> every frame.
/// </summary>
private void Update()
{
    // Skip when no Text has been assigned, instead of throwing on every frame.
    if (debugText == null)
    {
        return;
    }

    debugText.text = audioToString;
}
整個腳本就完成了,然后把腳本掛到場景中一個物體上,再建一個Text,拖到debugText上,運行起來,點Start,說一句話,說完再點End,就會把說的話轉成文字輸出到debugText上。
我覺得這個方法還是比較簡單的,只是必須要聯網才行,不過我們這個程序是要登錄注冊才能進去的,所以不存在不聯網的問題,大家可以參考參考,有好的意見可以提出來共同提高。