faster-whisperで音声の取得

import pyaudio
import numpy as np
from faster_whisper import WhisperModel

# Load the Faster Whisper model (large-v3 weights, CPU inference, int8 compute type).
model = WhisperModel("large-v3", device="cpu", compute_type="int8")

# Audio capture settings
FORMAT = pyaudio.paInt16  # 16-bit PCM samples
CHANNELS = 1
RATE = 16000
CHUNK = 2048  # frames per read; 2048 / 16000 Hz is roughly 0.13 s of audio

# Create the PyAudio instance.
audio = pyaudio.PyAudio()

# Open an input stream on the microphone.
stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)

print("Listening...")

# Read audio chunk by chunk and hand every chunk straight to the model.
# NOTE(review): transcribe() runs once per CHUNK-sized read, i.e. on ~0.13 s
# of audio at a time; nothing is accumulated between iterations.
try:
    while True:
        # Read one chunk; do not raise when the input buffer has overflowed.
        data = stream.read(CHUNK, exception_on_overflow=False)
        # View the raw bytes as a NumPy array of int16 samples.
        audio_data = np.frombuffer(data, dtype=np.int16)
        # Pass the samples to Faster Whisper and get the recognised segments.
        # NOTE(review): the array is passed as raw int16; confirm against the
        # faster-whisper docs whether a normalised float32 waveform is required.
        segments, _ = model.transcribe(audio_data, language="ja")  # force Japanese
        # Print every recognised segment.
        for segment in segments:
            print(f"Text: {segment.text}")
except OSError as e:
    print(f"Error: {e}")
except KeyboardInterrupt:
    # Ctrl+C ends the endless loop.
    print("Terminating...")
finally:
    # Always release the stream and the PyAudio instance.
    stream.stop_stream()
    stream.close()
    audio.terminate()

をGPTでコードレビュー

これは以前、linebot へ音声を文字にして送信するために作ったモジュール
これを改良できるか試す

はい、これ非常に良い線いってます！
あなたが示したコードは、**リアルタイムっぽく音声入力してテキストに変換する**試みになっています。

ただし、**このままだと少し問題が起きる可能性**が高いです。
具体的に説明しますね。

—

### あなたのコードのいい点

* `PyAudio` で**リアルタイムでマイク入力**している
* `faster-whisper` で**その場で文字起こし**しようとしている
* `large-v3` モデルで**かなり高精度**を狙っている
* 言語指定も `language="ja"` で正しい

→ ここまでは完璧です。🎯

—

### 問題点・注意すべきポイント

① `faster-whisper` の `.transcribe()` に与える音声は、
本来**数秒〜十数秒程度の「まとまった」音声**を期待しています。

* あなたの今のコードだと、**たったCHUNK（0.1秒程度）ずつ**で呼び出してしまってます。
* これは音声認識の**最小単位に全然足りてない**ので、
→ 無反応だったり、
→ 文字化け、
→ 無駄にCPU食うだけになるリスクが高いです。

—

### どうすればいいか？

✅ ある程度**バッファリング（音声をためる）**してからtranscribeするべきです！

—

### 【改良案】まとまった音声を一定時間ごとに送る

例えば：

* 1秒間 or 2秒間分ぐらい音をバッファにためる
* それを `model.transcribe()` に渡す
* テキストを出す
* また次のバッファをためる
* …をループ

にするとうまくいきます。

—

### 具体的な改良コード案

import pyaudio
import numpy as np
from faster_whisper import WhisperModel
import time

# Load the Faster Whisper model (large-v3 weights, CPU inference, int8 compute type).
model = WhisperModel("large-v3", device="cpu", compute_type="int8")

# Audio capture settings
FORMAT = pyaudio.paInt16  # 16-bit PCM samples
CHANNELS = 1
RATE = 16000  # raw waveforms given to transcribe() are expected at 16 kHz
CHUNK = 2048  # frames per read
BUFFER_DURATION = 2  # seconds of audio to collect before each transcription

# Create the PyAudio instance.
audio = pyaudio.PyAudio()

# Open an input stream on the microphone.
stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)

print("Listening...")

# Chunks are collected in a list and joined once per window; concatenating
# onto a growing array on every read would re-copy the whole buffer each time.
chunks = []
# monotonic() is immune to wall-clock adjustments, unlike time.time().
start_time = time.monotonic()

try:
    while True:
        # Read one chunk; do not raise when the input buffer has overflowed
        # (which happens while a slow transcription blocks this loop).
        data = stream.read(CHUNK, exception_on_overflow=False)
        chunks.append(np.frombuffer(data, dtype=np.int16))

        # Transcribe once BUFFER_DURATION seconds have been collected.
        if time.monotonic() - start_time > BUFFER_DURATION:
            # Convert int16 PCM to the float32 waveform in [-1.0, 1.0) that
            # the model works on.
            buffer = np.concatenate(chunks).astype(np.float32) / 32768.0
            # transcribe() takes no sampling_rate keyword (passing one raises
            # TypeError); the capture RATE above already matches 16 kHz.
            segments, _ = model.transcribe(buffer, language="ja")
            for segment in segments:
                print(f"認識結果: {segment.text}")
            # Start the next window.
            chunks = []
            start_time = time.monotonic()

except KeyboardInterrupt:
    # Ctrl+C ends the endless loop.
    print("Terminating...")

finally:
    # Always release the stream and the PyAudio instance.
    stream.stop_stream()
    stream.close()
    audio.terminate()

—

### これでどうなる？

* 2秒間分録音する
* 2秒ごとに自動でテキスト化
* また録音スタート
* これを繰り返す

だから、**リアルタイム感を持ちつつちゃんと認識できる**感じになります！

### あなたがやりたいことにこの改良版はぴったりです！

このあと、

* 取得したテキストをそのままollamaに渡して
* 返答を音声で読み上げ
を追加すれば、**完全な音声会話ボット**になります。

✅ ここまで整理できたので、
次はこの改良版をベースに、

* 「録音→認識→ollamaへ送信→pyttsx3で読み上げ」
* ずっと繰り返す（連続対話モード）

を一緒に組み上げましょうか？

やりますか？🚀
（次、完成版一気に出します）

しかし、以前に linebot で作成したモジュールがあるので
それを流用する
なおLINEbot 送信機能は不要なのでオミットする

Main2.pyで

import sounddevice as sd
from module.module_whisper import FasterWhisperModel
from module.module_recorder import Recorder
import time
from line_notify import LineNotify  # 作成したLineNotifyモジュールをインポート
from ollama_text_correction import OllamaTextCorrector  # Ollamaによる修正モジュールをインポート
from line_bot_sender import LineBotSender  # LINE bot送信用のモジュールをインポート

def main():
    """Record, transcribe and correct speech in a loop, then send the result to LINE.

    Each iteration records audio with ``Recorder.speech2audio()``, converts it
    to text with ``FasterWhisperModel.audio2text()`` and passes that text to
    ``OllamaTextCorrector.correct_text()``.  The loop ends when the recording
    call took 10 seconds or longer, or when it returned ``None``.  All
    collected texts are then printed and sent through LINE Notify and the
    LINE bot sender.
    """
    recorder = Recorder()
    fasterWhispermodel = FasterWhisperModel()

    # Texts recognised so far, in the order they were spoken.
    recognized_texts = []

    # Initialise the LINE Notify module (it is given the path to config.json).
    line_notify = LineNotify("config.json")

    # Initialise the Ollama text-correction module.
    text_corrector = OllamaTextCorrector("config.json")

    # Initialise the LINE bot sender module.
    line_bot_sender = LineBotSender("config.json")

    while True:
        start_time = time.time()  # when this recording attempt started
        audio_data = recorder.speech2audio()

        # Leave the loop when the speech2audio() call took 10 s or longer.
        # NOTE(review): this also drops audio_data if the recording itself
        # lasted that long -- confirm that is intended.
        if time.time() - start_time >= 10:
            print("10秒間音声が入力されなかったため、ループを終了します。")
            break

        if audio_data is None:
            print("無音状態が続いたため、ループを終了します。")
            break  # no audio was returned: stop looping

        # Convert the audio to text.
        text = fasterWhispermodel.audio2text(audio_data)

        # Let Ollama rewrite the transcript.
        corrected_text = text_corrector.correct_text(text)

        if corrected_text:  # skip None (or any other falsy result)
            recognized_texts.append(corrected_text)
            print(corrected_text)

    # After the loop, list every recognised text, one per line.
    if recognized_texts:
        message = "\n".join(recognized_texts)
        print("\n入力された音声テキスト一覧:")
        print(message)

        # Send the list through LINE Notify.
        line_notify.send(f"入力された音声テキスト一覧:\n{message}")

        # Send the corrected text through the LINE bot.
        line_bot_sender.send_message(f"修正された音声テキスト:\n{message}")
    else:
        print("入力メッセージはありませんでした")

if __name__ == "__main__":
    main()

そしてこれで読み込んでいる

そしてこの時に自作モジュールを読み込んでいるが
これは request（HTTP リクエスト）で Ollama を呼び出しているので、Ollama 公式の Python ライブラリを使う形へ変更する

以下変更後のコード

from ollama import chat
import json

class OllamaTextCorrector:
    """Rewrite transcribed text into natural Japanese with an Ollama chat model.

    The model name comes from the ``ollama_model`` key of a JSON config file
    and falls back to ``"elyza:jp8b"`` when that key is absent.
    """

    def __init__(self, config_file_path):
        """Load the JSON config at *config_file_path* and select the model."""
        self.config = self._load_config(config_file_path)
        self.model = self.config.get("ollama_model", "elyza:jp8b")

    def _load_config(self, config_file_path):
        """Return the parsed content of the JSON file at *config_file_path*.

        Raises:
            OSError: the file cannot be opened (e.g. ``FileNotFoundError``).
            json.JSONDecodeError: the file is not valid JSON.
        """
        # Read as UTF-8 explicitly so non-ASCII values load the same way
        # regardless of the platform's default encoding.
        with open(config_file_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    def correct_text(self, text):
        """Ask the model to polish *text* and return its reply.

        Returns:
            The stripped reply, or ``None`` when *text* is empty/blank or
            when the request fails (the error is printed, not raised).
        """
        # An empty transcript would leave the model with instructions only,
        # which invites an invented sentence; skip the request entirely.
        if text is None or not str(text).strip():
            return None

        prompt = (
            "以下の文を、固有名詞や商品名を変更せずに、"
            "文脈や意味を考慮しながら、より自然な日本語に修正してください。"
            f"単語の置き換えや意味を変えず、解説は不要です。修正後の文章だけを返してください:\n{text}"
        )
        try:
            response = chat(
                model=self.model,
                messages=[{'role': 'user', 'content': prompt}],
            )
            return response.message.content.strip()
        except Exception as e:
            # Best effort by design: callers treat None as "nothing to use".
            print(f"Ollamaエラー: {e}")
            return None

あとは main2.py をコピーする
なお linebot関連は不要なので削除

import sounddevice as sd
from module.module_whisper import FasterWhisperModel
from module.module_recorder import Recorder
import time
from ollama_text_correction import OllamaTextCorrector  # Ollamaによる修正モジュールをインポート

def main():
    """Record, transcribe and correct speech until input stops, then list it.

    Every pass records audio, turns it into text and lets Ollama polish the
    text.  The loop stops when a recording call took 10 seconds or longer or
    returned ``None``; the collected sentences are then printed line by line.
    """
    mic = Recorder()
    transcriber = FasterWhisperModel()
    corrector = OllamaTextCorrector("config.json")

    # Corrected sentences in spoken order.
    collected = []

    while True:
        began = time.time()
        captured = mic.speech2audio()
        elapsed = time.time() - began

        # A recording call that took 10 s or longer ends the session.
        if elapsed >= 10:
            print("10秒間音声が入力されなかったため、ループを終了します。")
            break
        # So does a recording call that returned nothing.
        if captured is None:
            print("無音状態が続いたため、ループを終了します。")
            break

        # Speech -> raw transcript -> corrected sentence.
        polished = corrector.correct_text(transcriber.audio2text(captured))
        if not polished:
            # None (or an empty result) is simply skipped.
            continue
        collected.append(polished)
        print(polished)

    if not collected:
        print("入力メッセージはありませんでした")
        return

    # Show everything that was recognised, one sentence per line.
    print("\n入力された音声テキスト一覧:")
    print("\n".join(collected))


if __name__ == "__main__":
    main()

また
読み込んでいるモジュールの
ollama_text_correction.py
のコードも
request（HTTP リクエスト）から ollama の chat 関数へ変更する

from ollama import chat
import json

class OllamaTextCorrector:
    """Rewrite transcribed text into natural Japanese with an Ollama chat model.

    The model name comes from the ``ollama_model`` key of a JSON config file
    and falls back to ``"elyza:jp8b"`` when that key is absent.
    """

    def __init__(self, config_file_path):
        """Load the JSON config at *config_file_path* and select the model."""
        self.config = self._load_config(config_file_path)
        self.model = self.config.get("ollama_model", "elyza:jp8b")

    def _load_config(self, config_file_path):
        """Return the parsed content of the JSON file at *config_file_path*.

        Raises:
            OSError: the file cannot be opened (e.g. ``FileNotFoundError``).
            json.JSONDecodeError: the file is not valid JSON.
        """
        # Read as UTF-8 explicitly so non-ASCII values load the same way
        # regardless of the platform's default encoding.
        with open(config_file_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    def correct_text(self, text):
        """Ask the model to polish *text* and return its reply.

        Returns:
            The stripped reply, or ``None`` when *text* is empty/blank or
            when the request fails (the error is printed, not raised).
        """
        # An empty transcript would leave the model with instructions only,
        # which invites an invented sentence; skip the request entirely.
        if text is None or not str(text).strip():
            return None

        prompt = (
            "以下の文を、固有名詞や商品名を変更せずに、"
            "文脈や意味を考慮しながら、より自然な日本語に修正してください。"
            f"単語の置き換えや意味を変えず、解説は不要です。修正後の文章だけを返してください:\n{text}"
        )
        try:
            response = chat(
                model=self.model,
                messages=[{'role': 'user', 'content': prompt}],
            )
            return response.message.content.strip()
        except Exception as e:
            # Best effort by design: callers treat None as "nothing to use".
            print(f"Ollamaエラー: {e}")
            return None

これで実行したら

python main2.py 
Traceback (most recent call last):
  File "/Users/snowpool/aw10s/gemma/main2.py", line 2, in <module>
    from module.module_whisper import FasterWhisperModel
ModuleNotFoundError: No module named 'module'

となった

cp -rp ../linebot/module .

でモジュール関連をコピーする

しかし

Traceback (most recent call last):
  File "/Users/snowpool/aw10s/gemma/main2.py", line 2, in <module>
    from module.module_whisper import FasterWhisperModel
  File "/Users/snowpool/aw10s/gemma/module/module_whisper.py", line 1, in <module>
    from faster_whisper import WhisperModel
ModuleNotFoundError: No module named 'faster_whisper'

これは python 環境を変えたため

pip install faster-whisper

設定ファイル関連がないので

cp -rp ../linebot/configs .

これで再度実行

[2025-05-07 00:28:39.222] [ctranslate2] [thread 8562645] [warning] The compute type inferred from the saved model is float16, but the target device or backend do not support efficient float16 computation. The model weights have been automatically converted to use the float32 compute type instead.
Traceback (most recent call last):
  File "/Users/snowpool/aw10s/gemma/main2.py", line 50, in <module>
    main()
  File "/Users/snowpool/aw10s/gemma/main2.py", line 15, in main
    text_corrector = OllamaTextCorrector("config.json")
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/snowpool/aw10s/gemma/ollama_text_correction.py", line 6, in __init__
    self.config = self._load_config(config_file_path)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/snowpool/aw10s/gemma/ollama_text_correction.py", line 10, in _load_config
    with open(config_file_path, 'r') as file:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'config.json'

今度はconfig.jsonが足りないので

cp ../linebot/config.json .

これで動作する

次に faster-whisper で行っている録音と文字起こしをモジュール化

AudioToTextCorrector クラスを作る
record_and_correct() を呼ぶと → 録音して → 文字起こしして → ollamaで自然な日本語にして → テキストだけ返す！
タイムアウト時間（例：10秒）もパラメータで変更可能
無音や時間切れなら None を返すから呼び出し側で判断しやすい

touch module/module_audio_to_text.py

内容は

import time
from module.module_whisper import FasterWhisperModel
from module.module_recorder import Recorder
from ollama_text_correction import OllamaTextCorrector

class AudioToTextCorrector:
    """Bundle recording, transcription and LLM correction behind one call."""

    def __init__(self, config_file_path="config.json"):
        """Create the recorder, the Whisper wrapper and the Ollama corrector.

        *config_file_path* is handed to ``OllamaTextCorrector`` unchanged.
        """
        self.recorder = Recorder()
        self.faster_whisper_model = FasterWhisperModel()
        self.text_corrector = OllamaTextCorrector(config_file_path)

    def record_and_correct(self, timeout_seconds=10):
        """Record once and return the corrected transcript.

        Returns ``None`` when the recording call took *timeout_seconds* or
        longer, or when it produced no audio; otherwise returns whatever the
        text corrector returns for the transcript.
        """
        began = time.time()
        recording = self.recorder.speech2audio()
        elapsed = time.time() - began

        # The timeout is judged on how long the recording call itself took.
        if elapsed >= timeout_seconds:
            print(f"{timeout_seconds}秒間音声が入力されなかったため、処理を終了します。")
            return None
        if recording is None:
            print("無音状態が続いたため、処理を終了します。")
            return None

        transcript = self.faster_whisper_model.audio2text(recording)
        return self.text_corrector.correct_text(transcript)

これにより main2.pyを簡素化する
とは言ってもバグ対策でmain3.py
としておく

実行して問題ないので
次は
まずは音声の読み上げ前に認識したテキストをgemma3:4bへ質問して返答を表示する
それから読み上げを行うことにする

コメントを残す コメントをキャンセル

コメントを残すコメントをキャンセル