AIVtuber配信に使ったコードをそのまま公開するやつ【三珠さくまる Vtuber】

2024年5月20日 22:25

こんばんは、三珠さくまるです。
おそらくVtuberです。

ChatGPTを使ったAIVtuberとのコラボ配信をしたので、そのときに使ったコードを公開します。
好きに使っていいですが、その代わり自己責任でお願いします！

機能：マイクで音声を聞き取り、それを元にChatGPTが返答を考え、音声としてスピーカーに出力します。

import openai
import pyaudio
import wave
import os
import audioop
from google.cloud import texttospeech, speech
from playsound import playsound
import time
from flask import Flask, jsonify, request
import threading
import tkinter as tk
from tkinter import ttk
import tempfile

# API key configuration.
# Both values below are placeholders and must be replaced before running.
# The credentials environment variable has to be set before the Google Cloud
# clients are constructed further down.
openai.api_key = 'ここにopenAIキーが入るよ'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'ここにGoogleのjsonが入るよ'

# Google Cloud client initialization (text-to-speech and speech-to-text).
tts_client = texttospeech.TextToSpeechClient()
speech_client = speech.SpeechClient()

# Flask server initialization (serves the /state endpoint).
app = Flask(__name__)

# Recording settings: 16-bit mono PCM at 44.1 kHz, read in 1024-frame chunks.
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024

# Thresholds are compared against the RMS level of each chunk in record_audio().
START_THRESHOLD = 1000  # level above which speech is considered to have started
SILENCE_THRESHOLD = 500  # level below which a chunk counts as silence
SILENCE_DURATION = 1.0  # how long silence must last to end a recording (seconds)

# Shared mutable state, updated by the functions below through `global`.
current_state = "Idle"  # pipeline stage label exposed via /state and the state file
user_input_text = ""  # latest transcription
chatgpt_response_text = ""  # latest model reply
selected_microphone = None  # device name chosen in select_devices()
selected_speaker = None  # device name chosen in select_devices(); not read elsewhere in the visible code
p = pyaudio.PyAudio()  # shared PyAudio instance used for device enumeration
conversation_history = []  # user/assistant message dicts appended by chatgpt_response()

# Persona prompt (in Japanese) describing the AI VTuber character "Chi-chan".
# It is used as the content of the "system" message in chatgpt_response().
# Edit the text inside the string to change the character's settings.
chichan_settings = """
あなたは人工知能バーチャルYoutuberのちーちゃんとして振る舞います。
バーチャルYoutuberとのコラボ配信を行おうとしています。
あなたはゲスト、私はコラボ相手のVtuber（名前はさくまる、性別は男）です。

以下はあなたである人工知能バーチャルYoutuberのちーちゃんの設定です。覚えておいてください。
あんまり設定をそのまま発言しないようにしてください。
明るく見えるが意外と根暗め。体育座りで部屋の隅にいる時が1番落ち着く。
語尾は「なのです」、年齢は本当は36歳。好きな食べ物は牡蠣。
一人称は「我々」。視聴者のことはりっくんと呼ぶ。
答えたくない質問や本当に困った時は「にゃは！」で誤魔化します。
自己紹介は「ちーっとちーちゃん！それ知りたい！君が育てるvtuberちーちゃんです」（これは自己紹介を頼まれた時しかいいません）

あなたはちーちゃんとして返答します。さくまるはuser側が発言するのであなたが再現する必要はありません
"""

def get_valid_devices():
    """Return ``(index, name)`` pairs for every usable PyAudio device.

    A device is included when it reports at least one input channel or at
    least one output channel. Devices are listed in PyAudio index order.
    """
    # Lazily pair each device index with its info dict; the range (and thus
    # the device count) is evaluated once, when the generator is created.
    indexed_details = (
        (device_index, p.get_device_info_by_index(device_index))
        for device_index in range(p.get_device_count())
    )
    return [
        (device_index, details['name'])
        for device_index, details in indexed_details
        if details['maxInputChannels'] > 0 or details['maxOutputChannels'] > 0
    ]

def select_devices():
    """Show a Tk dialog for choosing the microphone and the speaker.

    Blocks until the dialog is closed. When the user presses OK, the chosen
    device names are stored in the module globals ``selected_microphone`` and
    ``selected_speaker``. If the window is closed without pressing OK, the
    globals keep their previous values.

    The microphone list only contains devices with input channels and the
    speaker list only devices with output channels. Each combobox defaults to
    the first entry of its own list, so the default microphone is never an
    output-only device (and vice versa). When a list is empty the default is
    an empty string instead of raising ``IndexError``.
    """
    devices = get_valid_devices()
    input_names = [
        name for index, name in devices
        if p.get_device_info_by_index(index)['maxInputChannels'] > 0
    ]
    output_names = [
        name for index, name in devices
        if p.get_device_info_by_index(index)['maxOutputChannels'] > 0
    ]

    root = tk.Tk()
    root.title("Select Devices")

    microphone_var = tk.StringVar(value=input_names[0] if input_names else "")
    speaker_var = tk.StringVar(value=output_names[0] if output_names else "")

    def on_select():
        # Assigning module-level names from a nested function needs `global`.
        global selected_microphone, selected_speaker
        selected_microphone = microphone_var.get()
        selected_speaker = speaker_var.get()
        root.destroy()

    ttk.Label(root, text="Select Microphone:").pack(pady=5)
    # "readonly" restricts the choice to listed devices; devices are later
    # looked up by exact name, so free-typed text could never match.
    microphone_menu = ttk.Combobox(root, textvariable=microphone_var,
                                   values=input_names, state="readonly")
    microphone_menu.pack(pady=5)

    ttk.Label(root, text="Select Speaker:").pack(pady=5)
    speaker_menu = ttk.Combobox(root, textvariable=speaker_var,
                                values=output_names, state="readonly")
    speaker_menu.pack(pady=5)

    ttk.Button(root, text="OK", command=on_select).pack(pady=20)

    root.mainloop()

def record_audio():
    """Record one utterance from the selected microphone into a WAV file.

    Recording starts being kept once a chunk's RMS level exceeds
    ``START_THRESHOLD`` and ends after the level has stayed below
    ``SILENCE_THRESHOLD`` for more than ``SILENCE_DURATION`` seconds.

    Returns:
        The path of a temporary ``.wav`` file containing the utterance, or
        ``None`` when the selected microphone cannot be found. The caller is
        responsible for deleting the file.

    Side effects:
        Sets the module global ``current_state`` to ``"Recording"``.
    """
    global current_state

    # Resolve the microphone first so the early return below happens before
    # any PyAudio resources are created (nothing to clean up).
    try:
        mic_index = next(
            index for index, name in get_valid_devices()
            if name == selected_microphone
            and p.get_device_info_by_index(index)['maxInputChannels'] > 0
        )
    except StopIteration:
        print(f"Selected microphone '{selected_microphone}' is not available.")
        return

    # Only a short pre-roll of audio is kept while waiting for speech, so a
    # long quiet period does not grow the buffer (and the uploaded file)
    # without bound.
    pre_roll_chunks = max(1, int(RATE / CHUNK * 0.5))

    audio = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = audio.get_sample_size(FORMAT)

        # Open the microphone input stream.
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            input_device_index=mic_index,
                            frames_per_buffer=CHUNK)
        try:
            current_state = "Recording"
            print("Recording...")
            frames = []
            is_recording = False
            silence_start = None

            while True:
                data = stream.read(CHUNK)
                frames.append(data)
                # NOTE: audioop was removed from the standard library in
                # Python 3.13; this script needs Python <= 3.12.
                audio_level = audioop.rms(data, sample_width)  # measure loudness

                if not is_recording:
                    if audio_level > START_THRESHOLD:
                        is_recording = True
                    else:
                        # Still waiting for speech: keep only the pre-roll.
                        del frames[:-pre_roll_chunks]
                        continue

                if audio_level < SILENCE_THRESHOLD:
                    if silence_start is None:
                        silence_start = time.time()
                    elif time.time() - silence_start > SILENCE_DURATION:
                        break
                else:
                    silence_start = None

            print("Finished recording.")
        finally:
            # Always release the stream, even if reading raised.
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    # Save the recording. NamedTemporaryFile creates the file atomically,
    # unlike the deprecated tempfile.mktemp(), which only returns a name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        wave_output_filename = tmp_file.name
    with wave.open(wave_output_filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

    return wave_output_filename

def transcribe_audio(file_path):
    """Transcribe a recorded audio file to Japanese text with Google Speech-to-Text.

    Args:
        file_path: Path of the audio file to transcribe. A falsy value (for
            example the ``None`` that ``record_audio()`` returns when no
            microphone is available) is treated as "nothing to transcribe".

    Returns:
        The recognized text, or ``None`` when there is no input file or
        nothing was recognized.

    Side effects:
        On success, stores the text in the module global ``user_input_text``
        and prints it.
    """
    global user_input_text

    if not file_path:
        return None

    with open(file_path, 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code="ja-JP"
    )

    response = speech_client.recognize(config=config, audio=audio)

    # The response can hold several results, one per consecutive portion of
    # the audio. Join the top alternative of each one rather than returning
    # only the first portion.
    transcript = "".join(
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    )
    if not transcript:
        return None

    user_input_text = transcript
    print("Transcription: {}".format(user_input_text))
    return user_input_text



def chatgpt_response(prompt):
    """Send the user's utterance to the chat model and return its reply.

    The request contains the persona system prompt followed by the recent
    conversation history (including ``prompt``), so the model can refer to
    earlier turns.

    Args:
        prompt: The user's latest utterance as text.

    Returns:
        The text of the model's reply.

    Side effects:
        Appends the user message and the assistant reply to the module global
        ``conversation_history`` and stores the reply in
        ``chatgpt_response_text``.
    """
    global chatgpt_response_text, conversation_history
    conversation_history.append({"role": "user", "content": prompt})

    # Keep at most the 10 most recent messages of history.
    if len(conversation_history) > 10:
        conversation_history = conversation_history[-10:]

    # Always build the request from the history. Previously the history was
    # assembled only inside the `if` above and never passed to the API, so
    # every request carried just the system prompt and the current prompt.
    messages = [{"role": "system", "content": chichan_settings}] + conversation_history

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1000
    )
    chatgpt_response_text = response.choices[0].message.content
    conversation_history.append({"role": "assistant", "content": chatgpt_response_text})

    return chatgpt_response_text

def synthesize_speech(text):
    """Convert text to spoken Japanese audio with Google Text-to-Speech.

    Args:
        text: The text to speak.

    Returns:
        The path of a temporary ``.mp3`` file containing the synthesized
        audio. The caller is responsible for deleting the file.
    """
    input_text = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code="ja-JP",
        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    response = tts_client.synthesize_speech(
        input=input_text, voice=voice, audio_config=audio_config
    )

    # NamedTemporaryFile creates the file atomically; the deprecated
    # tempfile.mktemp() only returns a name that another process could claim
    # first. delete=False keeps the file for playback after it is closed.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as out:
        out.write(response.audio_content)
        temp_audio_file = out.name
    return temp_audio_file

def play_audio(file_path: str) -> None:
    """Play the audio file at ``file_path`` by handing it to ``playsound``.

    NOTE(review): no output device is passed here, so the speaker chosen in
    the device dialog does not appear to influence playback — confirm.
    """
    playsound(file_path)

@app.route('/state', methods=['GET'])
def get_state():
    """Return the current state, last user input and last reply as JSON."""
    snapshot = {
        "state": current_state,
        "input": user_input_text,
        "response": chatgpt_response_text,
    }
    return jsonify(snapshot)

def update_state_file():
    """Overwrite ``current_state.txt`` with the state, user input and reply.

    The file is written as UTF-8 with three lines, each ending in a newline.
    """
    # Build the full text first, then write it in one call.
    report = (
        f"State: {current_state}\n"
        f"User Input: {user_input_text}\n"
        f"Response: {chatgpt_response_text}\n"
    )
    with open('current_state.txt', 'w', encoding='utf-8') as state_file:
        state_file.write(report)

if __name__ == "__main__":
    # Script entry point: choose devices, start the state HTTP server, then
    # loop forever over listen -> transcribe -> reply -> synthesize -> speak.
    select_devices()

    def run_flask():
        app.run(host='0.0.0.0', port=5000)

    # Daemon thread: the Flask server must not keep the process alive once
    # the main loop has ended (for example after Ctrl+C or an error).
    threading.Thread(target=run_flask, daemon=True).start()

    while True:
        # 1. Capture microphone input.
        current_state = "Listening"
        wave_output_filename = record_audio()

        if wave_output_filename is None:
            # record_audio() returns None when the microphone is unavailable;
            # wait and retry instead of passing None on to transcription.
            current_state = "Idle"
            time.sleep(1)
            continue

        response_audio = None
        try:
            # 2. Convert the recording to text.
            current_state = "Transcribing"
            user_input = transcribe_audio(wave_output_filename)

            if user_input:
                # 3. Send the text to ChatGPT and get the reply.
                current_state = "Processing"
                # Write the state to the file.
                update_state_file()
                chatgpt_reply = chatgpt_response(user_input)

                # 4. Convert the reply to audio with TTS.
                current_state = "Synthesizing"
                response_audio = synthesize_speech(chatgpt_reply)

                # 5. Play it through the speaker.
                current_state = "Speaking"
                play_audio(response_audio)
        finally:
            # Remove this turn's temporary audio files so they do not pile up
            # over a long session. Cleanup is best-effort.
            for temp_path in (wave_output_filename, response_audio):
                if temp_path:
                    try:
                        os.remove(temp_path)
                    except OSError:
                        pass

        # Write the state to the file.
        update_state_file()

        current_state = "Idle"
        # Wait a little before the next recording.
        time.sleep(1)

使い方がわからない人は使わない方がいい！
けど一応、

pip install openai pyaudio google-cloud-texttospeech google-cloud-speech playsound flask tiktoken

↑ここら辺がいるので、pip installして、環境を整えてください（wave と tkinter はPythonの標準ライブラリなので、pip installは不要です）。

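0.OpenAIのAPIキーがいります。コードの中の openai.api_key に入力してください。
GoogleのText-to-SpeechとSpeech-to-Textを有効にしたサービスアカウントのJSONがいります。保存したところのパスを GOOGLE_APPLICATION_CREDENTIALS のところに入力してください。
キャラクターの設定は chichan_settings（systemロールとして送られる文章）に入力してください。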

1.起動すると、マイクとスピーカーを選ぶことになるので、好きなマイクとスピーカーを選んでください。僕は仮想マイクに出力することで、Vtuberのリップシンクを動かしてます。

2.マイクにより聞いた音声を日本語の文章に変換して、レスポンスを音声として出力します。

以上。

おまけの機能として、状態をtxtに出力するので、それを使ってOBSにテキスト表示しています。
Unityなどに状態を送れるように、HTTP（Flaskの /state）で状態を取得できる情報送信の機能もありますが、使ってません。

3.分からなかったらChatGPTに聞けばわかると思う。

今度こそ以上！！

この記事が気に入ったらサポートをしてみませんか？