
How to Build a WHO-5 Conversation-Estimation PoC

[OPENAI_API_KEY]

tomoi@DESKTOP-NHQLQU4:~/who5_chat_poc/templates$ export OPENAI_API_KEY="sk-proj-...(redacted; use your own key)"
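
Before launching the app, a quick way to confirm the key is visible in the current shell (prints only the prefix):

tomoi@DESKTOP-NHQLQU4:~/who5_chat_poc$ echo "${OPENAI_API_KEY:0:8}..."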

[tree]

tomoi@DESKTOP-NHQLQU4:~$ tree who5_chat_poc/
who5_chat_poc/
├── app.py
├── requirements.txt
├── templates
│   └── index.html
└── who5_poc.sqlite3

[venv setup]

tomoi@DESKTOP-NHQLQU4:~/who5_chat_poc$ python3 -m venv venv
tomoi@DESKTOP-NHQLQU4:~/who5_chat_poc$ source venv/bin/activate
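
One step is not shown above: the dependencies from requirements.txt (listed below) have to be installed into the venv before the first run.

(venv) tomoi@DESKTOP-NHQLQU4:~/who5_chat_poc$ pip install -r requirements.txt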

[python app.py]

(venv) tomoi@DESKTOP-NHQLQU4:~/who5_chat_poc$ python app.py
* Serving Flask app 'app'
* Debug mode: on
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
* Running on http://127.0.0.1:5000
Press CTRL+C to quit
* Restarting with stat
* Debugger is active!
* Debugger PIN: 793-572-591

The app is now running at http://localhost:5000/.
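
As a smoke test without the browser, /api/infer can also be called directly; the text below is just an illustrative input, and the response is the normalized JSON produced by normalize_result:

(venv) tomoi@DESKTOP-NHQLQU4:~/who5_chat_poc$ curl -s -X POST http://localhost:5000/api/infer \
    -H "Content-Type: application/json" \
    -d '{"text": "最近はよく眠れていて、週に3回ほど散歩に出ています。"}'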

=================

[who5_chat_poc/requirements.txt]

flask==3.0.3
openai>=1.40.0
httpx==0.27.2
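
Note: httpx is pinned to 0.27.x, most likely because httpx 0.28 removed the deprecated proxies argument that older openai SDK versions still pass when constructing their HTTP client; keeping the pin avoids that incompatibility.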

[who5_chat_poc/app.py]

import os
import json
import uuid
import sqlite3
import tempfile
from pathlib import Path
from datetime import datetime
from typing import Any, Dict

from flask import Flask, request, jsonify, render_template, send_file, after_this_request
from openai import OpenAI

APP_TITLE = "WHO-5 会話推定 PoC"
DB_PATH = os.path.join(os.path.dirname(__file__), "who5_poc.sqlite3")

# Note: not a medical diagnosis; this is positioned strictly as a PoC estimate
DISCLAIMER = (
    "本機能は医療的な診断・判定を目的としない推定(目安)です。"
    "異常が疑われる場合は医療機関等へ相談してください。"
)

# ---------------------------
# Voice Q&A (the browser asks each question; the user answers by voice)
# ---------------------------
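# Q2-Q6 below roughly track WHO-5 items 1-5 (cheerful mood, calm, activity,
# restful sleep, interest); Q7-Q8 probe outing frequency and social contact.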
INTERVIEW_QUESTIONS = [
    "ここ1週間の生活を一言でいうと、どんな感じですか?",
    "この1週間で、楽しいと感じたことはありましたか?あれば何ですか?",
    "この1週間、落ち着いてリラックスできた時間はどのくらいありましたか?",
    "この1週間、活動的に動けた日はどのくらいありましたか?",
    "睡眠はどうですか?寝つき、途中で目が覚める、起床時の爽快感を教えてください。",
    "この1週間、興味を持って取り組めたことはありましたか?",
    "外出は週に何回くらいですか?買い物や散歩でも構いません。",
    "人と会ったり話したりはどのくらいありましたか?",
]

# Session state (in-memory only, since this is a PoC)
# SESSIONS[session_id] = {"i": 0, "qa": [{"q":..., "a":...}, ...]}
SESSIONS: Dict[str, Dict[str, Any]] = {}

# ---------------------------
# Instructions for the LLM (output JSON only)
# ---------------------------
SYSTEM_PROMPT = """\
あなたは「会話からWHO-5相当の指標を推定する」アシスタント。
次のルールに厳密に従うこと。

# 目的
ユーザーの自由記述(会話)から、WHO-5の5項目に対応する状態を推定して数値化する。
併せて外出頻度(週あたり)を推定する。

# 出力形式(厳守)
必ず JSON だけを出力する(前後に説明文やコードブロックを付けない)。
JSONスキーマは以下とする:

{
  "who5": {
    "item1": 0-5,
    "item2": 0-5,
    "item3": 0-5,
    "item4": 0-5,
    "item5": 0-5
  },
  "who5_total_0_25": 0-25,
  "who5_score_0_100": 0-100,
  "outing_per_week_est": 0以上の数(整数推奨),
  "confidence_0_1": 0.0-1.0,
  "evidence": {
    "who5_item1": "根拠(会話の要点を短く)",
    "who5_item2": "...",
    "who5_item3": "...",
    "who5_item4": "...",
    "who5_item5": "...",
    "outing": "..."
  },
  "notes": "不確実性や追加で聞くべき最小質問がある場合は短く書く。なければ空文字。"
}

# WHO-5の項目(対応づけの参考)
item1: 明るく楽しい気分
item2: 落ち着いたリラックス
item3: 意欲的・活動的
item4: よく休めて爽快に目覚め
item5: 日常生活で興味のあることが多い

# 採点ルール(0-5)
0: 全くない / ほぼ該当しない
1: まれに
2: ときどき
3: しばしば
4: ほとんどいつも
5: いつも

# 計算
who5_total_0_25 = item1+...+item5
who5_score_0_100 = who5_total_0_25 * 4

# 注意
- 会話が短く推定が困難なら confidence を下げ、notes に「最小の追加質問」を最大2個だけ書く。
- 推定不能でも JSON 形式は必ず守る。
"""

app = Flask(__name__)
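# The OpenAI client picks up OPENAI_API_KEY from the environment (exported above).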
client = OpenAI()


def db_init() -> None:
    with sqlite3.connect(DB_PATH) as con:
        con.execute(
            """
            CREATE TABLE IF NOT EXISTS logs (
              id INTEGER PRIMARY KEY AUTOINCREMENT,
              created_at TEXT NOT NULL,
              user_text TEXT NOT NULL,
              model TEXT NOT NULL,
              result_json TEXT NOT NULL
            )
            """
        )
        con.commit()


def clamp_int(x: Any, lo: int, hi: int) -> int:
    try:
        v = int(round(float(x)))
    except Exception:
        return lo
    return max(lo, min(hi, v))


def clamp_float(x: Any, lo: float, hi: float) -> float:
    try:
        v = float(x)
    except Exception:
        return lo
    return max(lo, min(hi, v))


def normalize_result(obj: Dict[str, Any]) -> Dict[str, Any]:
    who5 = obj.get("who5", {}) or {}
    item1 = clamp_int(who5.get("item1", 0), 0, 5)
    item2 = clamp_int(who5.get("item2", 0), 0, 5)
    item3 = clamp_int(who5.get("item3", 0), 0, 5)
    item4 = clamp_int(who5.get("item4", 0), 0, 5)
    item5 = clamp_int(who5.get("item5", 0), 0, 5)
    total = item1 + item2 + item3 + item4 + item5
    score = total * 4

    outing = clamp_int(obj.get("outing_per_week_est", 0), 0, 50)
    conf = clamp_float(obj.get("confidence_0_1", 0.3), 0.0, 1.0)

    evidence = obj.get("evidence", {}) or {}
    notes = obj.get("notes", "")

    return {
        "who5": {"item1": item1, "item2": item2, "item3": item3, "item4": item4, "item5": item5},
        "who5_total_0_25": total,
        "who5_score_0_100": score,
        "outing_per_week_est": outing,
        "confidence_0_1": conf,
        "evidence": {
            "who5_item1": str(evidence.get("who5_item1", ""))[:120],
            "who5_item2": str(evidence.get("who5_item2", ""))[:120],
            "who5_item3": str(evidence.get("who5_item3", ""))[:120],
            "who5_item4": str(evidence.get("who5_item4", ""))[:120],
            "who5_item5": str(evidence.get("who5_item5", ""))[:120],
            "outing": str(evidence.get("outing", ""))[:120],
        },
        "notes": str(notes)[:200],
        "disclaimer": DISCLAIMER,
    }


def infer_from_text(user_text: str, model: str = "gpt-4.1-mini") -> Dict[str, Any]:
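    # Note: SDK/model combinations that support it could also pass
    # response_format={"type": "json_object"} here to enforce JSON output;
    # the manual extraction below keeps things working either way.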
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_text},
        ],
        temperature=0.2,
    )
    text = (resp.choices[0].message.content or "").strip()

    try:
        obj = json.loads(text)
    except Exception:
        # Fall back to extracting the JSON substring
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end != -1 and end > start:
            obj = json.loads(text[start : end + 1])
        else:
            obj = {
                "who5": {"item1": 0, "item2": 0, "item3": 0, "item4": 0, "item5": 0},
                "outing_per_week_est": 0,
                "confidence_0_1": 0.1,
                "evidence": {},
                "notes": "出力がJSONとして取得できませんでした。入力をもう少し具体的にしてください。",
            }

    return normalize_result(obj)


def make_spoken_summary(result: Dict[str, Any]) -> str:
    score = result.get("who5_score_0_100", "-")
    outing = result.get("outing_per_week_est", "-")
    conf = result.get("confidence_0_1", "-")
    notes = (result.get("notes") or "").strip()

    msg = f"推定結果です。WHO-5スコアは{score}点、外出頻度は週{outing}回程度、信頼度は{conf}です。"
    if notes:
        msg += f"補足として、{notes}"
    return msg


@app.get("/")
def index():
    return render_template("index.html", app_title=APP_TITLE)


@app.post("/api/infer")
def api_infer():
    payload = request.get_json(force=True)
    user_text = (payload.get("text") or "").strip()
    if not user_text:
        return jsonify({"error": "text is empty"}), 400

    model = (payload.get("model") or "gpt-4.1-mini").strip()
    result = infer_from_text(user_text, model=model)

    with sqlite3.connect(DB_PATH) as con:
        con.execute(
            "INSERT INTO logs(created_at, user_text, model, result_json) VALUES (?,?,?,?)",
            (datetime.now().isoformat(timespec="seconds"), user_text, model, json.dumps(result, ensure_ascii=False)),
        )
        con.commit()

    return jsonify(result)


# ---------------------------
# Voice Q&A: start a session
# ---------------------------
@app.post("/api/session/start")
def api_session_start():
    sid = str(uuid.uuid4())
    SESSIONS[sid] = {"i": 0, "qa": []}
    first_q = INTERVIEW_QUESTIONS[0]
    return jsonify({"session_id": sid, "question": first_q, "done": False})


# ---------------------------
# Voice Q&A: accept a spoken answer, then return the next question or the final estimate
# ---------------------------
@app.post("/api/session/answer_audio")
def api_session_answer_audio():
    sid = request.form.get("session_id", "").strip()
    if not sid or sid not in SESSIONS:
        return jsonify({"error": "invalid session_id"}), 400

    if "audio" not in request.files:
        return jsonify({"error": "audio file is missing"}), 400

    sess = SESSIONS[sid]
    idx = int(sess.get("i", 0))
    if idx >= len(INTERVIEW_QUESTIONS):
        return jsonify({"error": "session already finished"}), 400

    f = request.files["audio"]
    suffix = Path(f.filename or "").suffix or ".webm"  # filename may be missing

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        f.save(tmp.name)
        tmp_path = tmp.name

    try:
        with open(tmp_path, "rb") as audio_fp:
            tr = client.audio.transcriptions.create(model="whisper-1", file=audio_fp)
        answer_text = (tr.text or "").strip()
    finally:
        try:
            os.remove(tmp_path)
        except Exception:
            pass

    q = INTERVIEW_QUESTIONS[idx]
    sess["qa"].append({"q": q, "a": answer_text})
    sess["i"] = idx + 1

    # More questions remain
    if sess["i"] < len(INTERVIEW_QUESTIONS):
        next_q = INTERVIEW_QUESTIONS[sess["i"]]
        return jsonify({"session_id": sid, "transcript": answer_text, "question": next_q, "done": False})

    # Final estimate (join all Q&A pairs)
    joined = "\n".join([f"Q: {x['q']}\nA: {x['a']}" for x in sess["qa"]]).strip()
    result = infer_from_text(joined)

    with sqlite3.connect(DB_PATH) as con:
        con.execute(
            "INSERT INTO logs(created_at, user_text, model, result_json) VALUES (?,?,?,?)",
            (datetime.now().isoformat(timespec="seconds"), joined, "interview+infer", json.dumps(result, ensure_ascii=False)),
        )
        con.commit()

    spoken = make_spoken_summary(result)

    return jsonify({"session_id": sid, "transcript": answer_text, "done": True, "result": result, "spoken": spoken})


# ---------------------------
# One-shot: audio → transcription
# ---------------------------
@app.post("/api/transcribe")
def api_transcribe():
    if "audio" not in request.files:
        return jsonify({"error": "audio file is missing"}), 400

    f = request.files["audio"]
    if not f.filename:
        return jsonify({"error": "filename is empty"}), 400

    suffix = Path(f.filename).suffix or ".webm"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        f.save(tmp.name)
        tmp_path = tmp.name

    try:
        with open(tmp_path, "rb") as audio_fp:
            tr = client.audio.transcriptions.create(model="whisper-1", file=audio_fp)
        text = (tr.text or "").strip()
        return jsonify({"text": text})
    finally:
        try:
            os.remove(tmp_path)
        except Exception:
            pass


# ---------------------------
# TTS: text → mp3 (robust across SDK versions)
# ---------------------------

@app.post("/api/tts")
def api_tts():
    payload = request.get_json(force=True)
    text = (payload.get("text") or "").strip()
    if not text:
        return jsonify({"error": "text is empty"}), 400

    # Create a temp file and delete it after the response has been sent
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        out_path = tmp.name

    @after_this_request
    def _cleanup(response):
        try:
            os.remove(out_path)
        except Exception:
            pass
        return response

    try:
        # Absorb SDK version differences:
        # 1) versions that accept response_format → response_format="mp3"
        #    (the SDK's keyword is response_format, not format)
        # 2) versions that don't accept it → omit it (mp3 is the usual default)
        try:
            speech = client.audio.speech.create(
                model="tts-1",
                voice="alloy",
                input=text,
                response_format="mp3",
            )
        except TypeError:
            speech = client.audio.speech.create(
                model="tts-1",
                voice="alloy",
                input=text,
            )

        # Absorb return-type differences across SDK versions and write to a file
        if hasattr(speech, "stream_to_file"):
            speech.stream_to_file(out_path)
        else:
            data = speech.read() if hasattr(speech, "read") else speech
            with open(out_path, "wb") as f:
                f.write(data)

        return send_file(out_path, mimetype="audio/mpeg", as_attachment=False)

    except Exception as e:
        return jsonify({"error": f"TTS failed: {type(e).__name__}: {e}"}), 500


# ---------------------------
# One-shot: recording → estimation (free speech)
# ---------------------------
@app.post("/api/infer_audio")
def api_infer_audio():
    if "audio" not in request.files:
        return jsonify({"error": "audio file is missing"}), 400

    f = request.files["audio"]
    suffix = Path(f.filename or "").suffix or ".webm"  # filename may be missing

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        f.save(tmp.name)
        tmp_path = tmp.name

    try:
        with open(tmp_path, "rb") as audio_fp:
            tr = client.audio.transcriptions.create(model="whisper-1", file=audio_fp)
        user_text = (tr.text or "").strip()
        if not user_text:
            return jsonify({"error": "transcription is empty"}), 400

        result = infer_from_text(user_text)

        with sqlite3.connect(DB_PATH) as con:
            con.execute(
                "INSERT INTO logs(created_at, user_text, model, result_json) VALUES (?,?,?,?)",
                (datetime.now().isoformat(timespec="seconds"), user_text, "whisper-1+infer", json.dumps(result, ensure_ascii=False)),
            )
            con.commit()

        spoken = make_spoken_summary(result)
        return jsonify({"transcript": user_text, "result": result, "spoken": spoken})
    finally:
        try:
            os.remove(tmp_path)
        except Exception:
            pass


if __name__ == "__main__":
    db_init()
    app.run(host="127.0.0.1", port=5000, debug=True)
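
Every estimate is appended to the logs table in who5_poc.sqlite3, so past runs can be inspected from the sqlite3 CLI. A minimal sketch, assuming the JSON1 functions that ship with modern SQLite:

tomoi@DESKTOP-NHQLQU4:~/who5_chat_poc$ sqlite3 who5_poc.sqlite3 \
    "SELECT created_at, model, json_extract(result_json, '$.who5_score_0_100') AS score FROM logs ORDER BY id DESC LIMIT 5;"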

[who5_chat_poc/templates/index.html]

<!doctype html>
<html lang="ja">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width,initial-scale=1" />
  <title>{{ app_title }}</title>
  <style>
    body { font-family: system-ui, -apple-system, Segoe UI, Roboto, "Hiragino Kaku Gothic ProN", "Noto Sans JP", sans-serif; margin: 24px; }
    textarea { width: 100%; height: 140px; font-size: 14px; }
    button { padding: 10px 14px; font-size: 14px; }
    .row { display:flex; gap:16px; align-items:flex-start; margin-top:16px; }
    .card { border:1px solid #ddd; border-radius:10px; padding:14px; flex:1; }
    .muted { color:#666; font-size: 12px; }
    pre { white-space: pre-wrap; word-break: break-word; background:#fafafa; border:1px solid #eee; padding:10px; border-radius:8px; margin: 6px 0; }
    .score { font-size: 34px; font-weight: 700; }
    .grid { display:grid; grid-template-columns: 1fr 1fr; gap:10px; }
    .pill { display:inline-block; padding:2px 8px; border:1px solid #ddd; border-radius:999px; font-size:12px; color:#444; background:#fff; }
  </style>
</head>
<body>
  <h1>{{ app_title }}</h1>
  <p class="muted">
    目的:自由記述から WHO-5 相当(目安)と外出頻度を推定します(医療診断ではありません)。
  </p>

  <label>日常の様子を、文章で入力してください(例:最近の1日の過ごし方/睡眠/外出/人との会話など)</label>
  <textarea id="text"></textarea>

  <!-- Voice Q&A (the browser reads each question aloud and records the answer) -->
  <div style="margin-top:10px; display:flex; gap:10px; align-items:center; flex-wrap:wrap;">
    <button id="startInterview">音声Q&A開始</button>
    <span class="pill" id="modePill">通常</span>
    <span class="muted" id="qStatus"></span>
  </div>
  <div class="muted" style="margin-top:10px;">現在の質問</div>
  <pre id="currentQuestion"></pre>

  <!-- Recording UI (shared by normal mode and Q&A mode) -->
  <div style="margin-top:10px; display:flex; gap:10px; align-items:center;">
    <button id="rec">録音開始</button>
    <button id="stop" disabled>録音停止</button>
    <span class="muted" id="recStatus"></span>
  </div>

  <div class="muted" style="margin-top:10px;">音声入力(文字起こし)</div>
  <pre id="transcriptBox"></pre>

  <div style="margin-top:10px; display:flex; gap:10px; align-items:center;">
    <button id="speak" disabled>結果を読み上げ</button>
    <span class="muted" id="speakStatus"></span>
  </div>

  <!-- Estimate from text (original feature) -->
  <div style="margin-top:10px; display:flex; gap:10px; align-items:center;">
    <button id="run">推定する</button>
    <span class="muted" id="status"></span>
  </div>

  <div class="row">
    <div class="card">
      <div class="muted">WHO-5 スコア(0?100)</div>
      <div class="score" id="who5score">-</div>

      <div class="grid" style="margin-top:10px;">
        <div>
          <div class="muted">合計(0?25)</div>
          <div id="who5total">-</div>
        </div>
        <div>
          <div class="muted">外出頻度(週)</div>
          <div id="outing">-</div>
        </div>
        <div>
          <div class="muted">信頼度(0?1)</div>
          <div id="conf">-</div>
        </div>
        <div>
          <div class="muted">注意</div>
          <div id="notes">-</div>
        </div>
      </div>

      <div class="muted" style="margin-top:10px;">根拠(要約)</div>
      <pre id="evidence"></pre>
    </div>

    <div class="card">
      <div class="muted">生JSON</div>
      <pre id="raw"></pre>
    </div>
  </div>

<script>
const runBtn = document.getElementById("run");
const statusEl = document.getElementById("status");

function fmt(x) {
  if (x === null || x === undefined) return "-";
  return String(x);
}

function renderResult(obj) {
  document.getElementById("who5score").textContent = fmt(obj.who5_score_0_100);
  document.getElementById("who5total").textContent = fmt(obj.who5_total_0_25);
  document.getElementById("outing").textContent = fmt(obj.outing_per_week_est);
  document.getElementById("conf").textContent = fmt(obj.confidence_0_1);
  document.getElementById("notes").textContent = obj.notes ? obj.notes : "-";

  const ev = obj.evidence || {};
  const lines = [
    `item1: ${ev.who5_item1 || ""}`,
    `item2: ${ev.who5_item2 || ""}`,
    `item3: ${ev.who5_item3 || ""}`,
    `item4: ${ev.who5_item4 || ""}`,
    `item5: ${ev.who5_item5 || ""}`,
    `outing: ${ev.outing || ""}`,
    "",
    `disclaimer: ${obj.disclaimer || ""}`
  ];
  document.getElementById("evidence").textContent = lines.join("\n");
  document.getElementById("raw").textContent = JSON.stringify(obj, null, 2);
}

// ----------------------------
// Text-based estimation (original flow)
// ----------------------------
runBtn.addEventListener("click", async () => {
  const text = document.getElementById("text").value.trim();
  if (!text) { alert("入力が空です"); return; }

  statusEl.textContent = "推定中…";
  runBtn.disabled = true;

  try {
    const res = await fetch("/api/infer", {
      method: "POST",
      headers: {"Content-Type":"application/json"},
      body: JSON.stringify({text})
    });

    const obj = await res.json();
    if (!res.ok) throw new Error(obj.error || "error");

    renderResult(obj);
    statusEl.textContent = "完了";
  } catch (e) {
    statusEl.textContent = "失敗";
    alert(String(e));
  } finally {
    runBtn.disabled = false;
  }
});

// ----------------------------
// Recording & TTS (shared by normal mode / Q&A mode)
// ----------------------------
let mediaRecorder = null;
let recordedChunks = [];
let lastSpokenText = "";

const recBtn = document.getElementById("rec");
const stopBtn = document.getElementById("stop");
const recStatus = document.getElementById("recStatus");
const transcriptBox = document.getElementById("transcriptBox");
const speakBtn = document.getElementById("speak");
const speakStatus = document.getElementById("speakStatus");

// Q&A UI
const startInterviewBtn = document.getElementById("startInterview");
const qStatus = document.getElementById("qStatus");
const currentQuestion = document.getElementById("currentQuestion");
const modePill = document.getElementById("modePill");

// Q&A state
let interviewSessionId = "";
let currentQuestionText = "";

// Important: flags that prevent overlapping operations
let interviewBusy = false; // true from session start until the question read-aloud completes
let ttsBusy = false;       // true while TTS audio is playing

function setModePill() {
  modePill.textContent = interviewSessionId ? "音声Q&A" : "通常";
}

// Generate speech (mp3) and play it; never play two clips at once
async function ttsPlay(text) {
  // If another clip is already playing, wait for it to finish
  while (ttsBusy) {
    await new Promise(r => setTimeout(r, 80));
  }
  ttsBusy = true;

  try {
    const res = await fetch("/api/tts", {
      method: "POST",
      headers: {"Content-Type":"application/json"},
      body: JSON.stringify({ text })
    });

    if (!res.ok) {
      let msg = "";
      const ct = res.headers.get("content-type") || "";
      if (ct.includes("application/json")) {
        const obj = await res.json().catch(() => ({}));
        msg = obj.error || JSON.stringify(obj);
      } else {
        msg = await res.text().catch(() => "");
      }
      throw new Error(msg || `tts error (${res.status})`);
    }

    const blob = await res.blob();
    const url = URL.createObjectURL(blob);
    const audio = new Audio(url);

    await audio.play();
    await new Promise(resolve => audio.onended = resolve);

    try { URL.revokeObjectURL(url); } catch (_) {}
  } finally {
    ttsBusy = false;
  }
}

// Start Q&A (guards against double-clicks and double starts; recording is blocked during read-aloud)
async function startInterview() {
  if (interviewBusy) return;      // prevent a double start
  if (interviewSessionId) return; // ignore if a session is already running

  interviewBusy = true;
  startInterviewBtn.disabled = true; // prevent rapid re-clicks
  recBtn.disabled = true;            // no recording until the read-aloud finishes
  stopBtn.disabled = true;

  try {
    qStatus.textContent = "開始中…";
    currentQuestion.textContent = "";
    currentQuestionText = "";

    const res = await fetch("/api/session/start", { method: "POST" });
    const obj = await res.json().catch(() => ({}));
    if (!res.ok) throw new Error(obj.error || "start failed");

    interviewSessionId = obj.session_id || "";
    currentQuestionText = obj.question || "";
    currentQuestion.textContent = currentQuestionText;

    setModePill();

    qStatus.textContent = "質問読み上げ中…";
    await ttsPlay(currentQuestionText);

    qStatus.textContent = "回答を録音してください(録音開始→録音停止)";
  } catch (e) {
    qStatus.textContent = "失敗";
    alert(String(e));
    interviewSessionId = "";
    setModePill();
  } finally {
    interviewBusy = false;
    startInterviewBtn.disabled = false;
    recBtn.disabled = false;
  }
}

startInterviewBtn.addEventListener("click", async () => {
  try {
    await startInterview();
  } catch (e) {
    // errors are handled inside startInterview, so this path is rarely reached
    qStatus.textContent = "失敗";
    alert(String(e));
  }
});

// Normal mode: recording → estimation
async function postAudioInfer(blob) {
  const fd = new FormData();
  fd.append("audio", blob, "recording.webm");

  const res = await fetch("/api/infer_audio", { method: "POST", body: fd });
  const obj = await res.json().catch(() => ({}));
  if (!res.ok) throw new Error(obj.error || "audio api error");
  return obj;
}

// Q&A mode: recording → submit the answer
async function postAudioAnswer(blob, sessionId) {
  const fd = new FormData();
  fd.append("audio", blob, "recording.webm");
  fd.append("session_id", sessionId);

  const res = await fetch("/api/session/answer_audio", { method: "POST", body: fd });
  const obj = await res.json().catch(() => ({}));
  if (!res.ok) throw new Error(obj.error || "answer_audio error");
  return obj;
}

// Start recording
recBtn.addEventListener("click", async () => {
  // Don't record while TTS is playing (keeps the question out of the answer audio)
  if (ttsBusy) {
    alert("読み上げ中です。読み上げが終わってから録音してください。");
    return;
  }

  recordedChunks = [];
  transcriptBox.textContent = "";
  recStatus.textContent = "マイク取得中…";

  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (e) {
    recStatus.textContent = "失敗";
    alert("マイクが取得できませんでした。ブラウザの権限を確認してください。");
    return;
  }

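  // Note: MediaRecorder typically produces audio/webm on Chromium/Firefox;
  // Safari may emit audio/mp4 instead, which whisper-1 can also decode.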
  try {
    mediaRecorder = new MediaRecorder(stream);
  } catch (e) {
    recStatus.textContent = "失敗";
    alert("MediaRecorderが利用できません。別ブラウザで試してください。");
    try { stream.getTracks().forEach(t => t.stop()); } catch (_) {}
    return;
  }

  mediaRecorder.ondataavailable = (e) => {
    if (e.data && e.data.size > 0) recordedChunks.push(e.data);
  };

  mediaRecorder.onstart = () => {
    recStatus.textContent = "録音中…";
    recBtn.disabled = true;
    stopBtn.disabled = false;
    // also lock the start button while recording (just in case)
    startInterviewBtn.disabled = true;
  };

  mediaRecorder.onstop = async () => {
    recStatus.textContent = "送信中…";
    stopBtn.disabled = true;

    const blob = new Blob(recordedChunks, { type: "audio/webm" });

    try {
      // ----------------------------
      // During Q&A, submit to answer_audio; otherwise use infer_audio
      // ----------------------------
      if (interviewSessionId) {
        const obj = await postAudioAnswer(blob, interviewSessionId);

        transcriptBox.textContent = obj.transcript || "";
        // also mirror into the textarea (optional)
        const textArea = document.getElementById("text");
        textArea.value = obj.transcript || textArea.value;

        if (obj.done) {
          // final result
          const r = obj.result;
          renderResult(r);

          lastSpokenText = obj.spoken || "";
          speakBtn.disabled = !lastSpokenText;

          currentQuestion.textContent = "終了しました。";
          qStatus.textContent = "終了(必要なら『結果を読み上げ』)";

          // leave Q&A mode
          interviewSessionId = "";
          setModePill();

          recStatus.textContent = "完了";
          return;
        } else {
          // next question
          currentQuestionText = obj.question || "";
          currentQuestion.textContent = currentQuestionText;

          qStatus.textContent = "質問読み上げ中…";
          recBtn.disabled = true; // no recording while the question is read aloud
          stopBtn.disabled = true;

          await ttsPlay(currentQuestionText);

          recBtn.disabled = false; // recording allowed again after read-aloud
          qStatus.textContent = "回答を録音してください(録音開始→録音停止)";

          recStatus.textContent = "完了";
          return; // don't fall through to the normal-mode path
        }
      }

      // ----------------------------
      // Normal mode: recording → estimation
      // ----------------------------
      const obj = await postAudioInfer(blob);

      transcriptBox.textContent = obj.transcript || "";
      lastSpokenText = obj.spoken || "";

      // also mirror into the textarea (optional)
      const textArea = document.getElementById("text");
      textArea.value = obj.transcript || textArea.value;

      // estimation result
      const r = obj.result;
      renderResult(r);

      speakBtn.disabled = !lastSpokenText;
      recStatus.textContent = "完了";
    } catch (e) {
      recStatus.textContent = "失敗";
      alert(String(e));
    } finally {
      // stop the media stream
      try { mediaRecorder.stream.getTracks().forEach(t => t.stop()); } catch (_) {}
      recBtn.disabled = false;
      startInterviewBtn.disabled = false;
    }
  };

  mediaRecorder.start();
});

// Stop recording
stopBtn.addEventListener("click", () => {
  if (mediaRecorder && mediaRecorder.state !== "inactive") {
    mediaRecorder.stop();
  }
});

// Read the result aloud
speakBtn.addEventListener("click", async () => {
  if (!lastSpokenText) return;
  speakStatus.textContent = "音声生成中…";
  speakBtn.disabled = true;
  recBtn.disabled = true; // no recording during read-aloud

  try {
    await ttsPlay(lastSpokenText);
    speakStatus.textContent = "完了";
  } catch (e) {
    speakStatus.textContent = "失敗";
    alert(String(e));
  } finally {
    speakBtn.disabled = false;
    recBtn.disabled = false;
  }
});

// Initial UI state
setModePill();
qStatus.textContent = "音声Q&Aは『音声Q&A開始』から開始します。";
currentQuestion.textContent = "(未開始)";
</script>
</body>
</html>