1 月之前 · 27e5b85adc
--- a/speech_tts_onnx.py
+++ b/speech_tts_onnx.py
@@ -17,17 +17,20 @@ from typing import Dict, List, Optional, Tuple
 
				 
			
 
				 import numpy as np
			
 
				 import soundfile as sf
			
 
				+import aiofiles
			
 
				 from fastapi import Body, FastAPI, HTTPException, Query, Request
			
 
				 from fastapi.middleware.cors import CORSMiddleware
			
 
				 from fastapi.responses import StreamingResponse
			
 
				 from pydantic import BaseModel, field_validator
			
 
				+from cachetools import TTLCache
			
 
				+import concurrent.futures
			
 
				 
			
 
				 
			
 
				 # ============================================================
			
 
				-# 配置
			
 
				+# 配置 model_quantized
			
 
				 # ============================================================
			
 
				 
			
 
				-DEFAULT_MODEL_NAME = os.getenv("TTS_ONNX_MODEL_NAME", "model_quantized.onnx")
			
 
				+DEFAULT_MODEL_NAME = os.getenv("TTS_ONNX_MODEL_NAME", "model.onnx")
			
 
				 MODEL_DIR = os.getenv("TTS_ONNX_MODEL_DIR", "/home/tts-server/onnx")
			
 
				 HF_MODEL_ID = os.getenv("TTS_ONNX_HF_MODEL_ID", "onnx-community/Kokoro-82M-ONNX")
			
 
				 TOKENIZER_PATH = os.getenv("TTS_ONNX_TOKENIZER_PATH", str(Path(MODEL_DIR) / "tokenizer.json"))
			
@@ -40,6 +43,11 @@ MAX_CONCURRENT_REQUESTS = int(os.getenv("MAX_CONCURRENT_REQUESTS", 8))
 
				 MEMORY_CACHE_SIZE = int(os.getenv("MEMORY_CACHE_SIZE", 200))
			
 
				 DISK_CACHE_SIZE = int(os.getenv("DISK_CACHE_SIZE", 500))
			
 
				 DEFAULT_SAMPLE_RATE = int(os.getenv("TTS_SAMPLE_RATE", 24000))
			
 
				+MEMORY_CACHE_TTL = int(os.getenv("MEMORY_CACHE_TTL", 18000))
			
 
				+ORT_INTRA_OP_THREADS = int(os.getenv("ORT_INTRA_OP_THREADS", "2"))
			
 
				+ORT_INTER_OP_THREADS = int(os.getenv("ORT_INTER_OP_THREADS", "1"))
			
 
				+ORT_ENABLE_CPU_MEM_ARENA = os.getenv("ORT_ENABLE_CPU_MEM_ARENA", "true").lower() == "true"
			
 
				+ORT_ENABLE_MEM_PATTERN = os.getenv("ORT_ENABLE_MEM_PATTERN", "true").lower() == "true"
			
 
				 VOICE_ALIASES = {}
			
 
				 
			
 
				 logging.basicConfig(level=getattr(logging, LOG_LEVEL, logging.INFO))
			
@@ -67,8 +75,11 @@ _KOKORO_ONNX_ENGINE = None
 
				 
			
 
				 model_lock = threading.Lock()
			
 
				 synthesis_lock = threading.Lock()
			
 
				+cache_lock = threading.Lock()
			
 
				 request_semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
			
 
				 current_requests: Dict[str, Dict] = {}
			
 
				+memory_cache = TTLCache(maxsize=MEMORY_CACHE_SIZE, ttl=MEMORY_CACHE_TTL)
			
 
				+executor = concurrent.futures.ThreadPoolExecutor(max_workers=8)
			
 
				 
			
 
				 model_session = None
			
 
				 model_name = DEFAULT_MODEL_NAME
			
@@ -96,6 +107,16 @@ def meta_cache_path(key: str) -> str:
 
				     return os.path.join(CACHE_DIR, f"{key}.json")
			
 
				 
			
 
				 
			
 
				+def put_memory_cache(key: str, value: dict):
			
 
				+    with cache_lock:
			
 
				+        memory_cache[key] = value
			
 
				+
			
 
				+
			
 
				+def get_memory_cache(key: str) -> Optional[dict]:
			
 
				+    with cache_lock:
			
 
				+        return memory_cache.get(key)
			
 
				+
			
 
				+
			
 
				 def to_mono_numpy(audio) -> np.ndarray:
			
 
				     if audio is None:
			
 
				         return np.array([], dtype=np.float32)
			
@@ -146,6 +167,71 @@ def read_wav_bytes(wav_path: str) -> Tuple[bytes, int]:
 
				     return buf.getvalue(), sr
			
 
				 
			
 
				 
			
 
				+async def load_sentence_from_disk(key: str) -> Optional[dict]:
			
 
				+    wav_path = sentence_cache_path(key)
			
 
				+    meta_path = meta_cache_path(key)
			
 
				+    if not Path(wav_path).exists() or not Path(meta_path).exists():
			
 
				+        return None
			
 
				+
			
 
				+    async with aiofiles.open(meta_path, "r") as f:
			
 
				+        meta_text = await f.read()
			
 
				+        meta = json.loads(meta_text)
			
 
				+
			
 
				+    wav_bytes, sr = await asyncio.get_event_loop().run_in_executor(
			
 
				+        executor, lambda: read_wav_bytes(wav_path)
			
 
				+    )
			
 
				+    meta["audio_bytes"] = wav_bytes
			
 
				+    meta["audio"] = base64.b64encode(wav_bytes).decode()
			
 
				+    meta["sample_rate"] = sr
			
 
				+    return meta
			
 
				+
			
 
				+
			
 
				+async def save_sentence_to_disk(key: str, audio: np.ndarray, sr: int, sentence: str):
			
 
				+    wav_path = sentence_cache_path(key)
			
 
				+    meta_path = meta_cache_path(key)
			
 
				+
			
 
				+    await asyncio.get_event_loop().run_in_executor(
			
 
				+        executor, lambda: sf.write(wav_path, audio, sr, format="WAV")
			
 
				+    )
			
 
				+
			
 
				+    async with aiofiles.open(meta_path, "w") as f:
			
 
				+        await f.write(json.dumps({"sentence": sentence, "sample_rate": sr}))
			
 
				+
			
 
				+
			
 
				+async def clean_disk_cache():
			
 
				+    files = sorted(Path(CACHE_DIR).glob("*.wav"), key=os.path.getmtime)
			
 
				+    if len(files) <= DISK_CACHE_SIZE:
			
 
				+        return
			
 
				+    remove_n = len(files) - DISK_CACHE_SIZE
			
 
				+    for f in files[:remove_n]:
			
 
				+        try:
			
 
				+            f.unlink()
			
 
				+            json_path = meta_cache_path(f.stem)
			
 
				+            if Path(json_path).exists():
			
 
				+                Path(json_path).unlink()
			
 
				+        except Exception:
			
 
				+            pass
			
 
				+
			
 
				+
			
 
				+def encode_wav_bytes(audio: np.ndarray, sr: int) -> bytes:
			
 
				+    buf = io.BytesIO()
			
 
				+    with sf.SoundFile(
			
 
				+        buf,
			
 
				+        "w",
			
 
				+        samplerate=sr,
			
 
				+        channels=audio.shape[1] if audio.ndim > 1 else 1,
			
 
				+        format="WAV",
			
 
				+        subtype="PCM_16",
			
 
				+    ) as f:
			
 
				+        f.write(audio)
			
 
				+    return buf.getvalue()
			
 
				+
			
 
				+
			
 
				+def wav_to_pcm_payload(wav_bytes: bytes) -> bytes:
			
 
				+    data, _ = sf.read(io.BytesIO(wav_bytes), dtype="int16")
			
 
				+    return np.asarray(data, dtype=np.int16).tobytes()
			
 
				+
			
 
				+
			
 
				 # ============================================================
			
 
				 # 模型加载
			
 
				 # ============================================================
			
@@ -267,7 +353,12 @@ def load_onnx_session(name: str):
 
				 
			
 
				     model_path = resolve_model_path(name)
			
 
				     providers = ["CPUExecutionProvider"]
			
 
				-    session = ort.InferenceSession(model_path, providers=providers)
			
 
				+    sess_options = ort.SessionOptions()
			
 
				+    sess_options.intra_op_num_threads = ORT_INTRA_OP_THREADS
			
 
				+    sess_options.inter_op_num_threads = ORT_INTER_OP_THREADS
			
 
				+    sess_options.enable_cpu_mem_arena = ORT_ENABLE_CPU_MEM_ARENA
			
 
				+    sess_options.enable_mem_pattern = ORT_ENABLE_MEM_PATTERN
			
 
				+    session = ort.InferenceSession(model_path, sess_options=sess_options, providers=providers)
			
 
				     logger.info("ONNX 模型加载完成: %s", model_path)
			
 
				     return session
			
 
				 
			
@@ -518,30 +609,68 @@ def synthesize_audio(text: str, voice: str, speed: float, model_name: Optional[s
 
				 
			
 
				 
			
 
				 def synthesize_wav_bytes(text: str, voice: str, speed: float, model_name: Optional[str] = None) -> io.BytesIO:
			
 
				-    segments: List[np.ndarray] = []
			
 
				-    if _is_english_voice(voice):
			
 
				-        parts = [graphemes for graphemes, _ in _phonemize_en_chunks_for_onnx(text)] if text.strip() else []
			
 
				-    else:
			
 
				-        parts = split_sentences(text) if text.strip() else []
			
 
				-    if not parts:
			
 
				-        parts = [text.strip()]
			
 
				-
			
 
				-    with synthesis_lock:
			
 
				-        for part in parts:
			
 
				-            audio = synthesize_audio(part, voice=voice, speed=speed, model_name=model_name)
			
 
				-            if audio.size > 0:
			
 
				-                segments.append(audio)
			
 
				-
			
 
				-    if not segments:
			
 
				-        raise HTTPException(status_code=400, detail="未生成音频，请检查输入文本或参数。")
			
 
				-
			
 
				-    audio_concat = np.concatenate(segments, axis=0)
			
 
				+    audio = synthesize_audio(text=text, voice=voice, speed=speed, model_name=model_name)
			
 
				     buf = io.BytesIO()
			
 
				-    sf.write(buf, audio_concat, samplerate=sample_rate, format="WAV", subtype="PCM_16")
			
 
				+    sf.write(buf, audio, samplerate=sample_rate, format="WAV", subtype="PCM_16")
			
 
				     buf.seek(0)
			
 
				     return buf
			
 
				 
			
 
				 
			
 
				+def iter_text_parts(text: str, split_pattern: Optional[str] = None) -> List[str]:
			
 
				+    text = (text or "").strip()
			
 
				+    if not text:
			
 
				+        return []
			
 
				+
			
 
				+    blocks = [text]
			
 
				+    if split_pattern:
			
 
				+        try:
			
 
				+            blocks = [part.strip() for part in re.split(split_pattern, text) if part.strip()]
			
 
				+        except re.error:
			
 
				+            logger.warning("split_pattern 非法，回退默认分句: %s", split_pattern)
			
 
				+
			
 
				+    parts: List[str] = []
			
 
				+    for block in blocks:
			
 
				+        sentences = split_sentences(block)
			
 
				+        if sentences:
			
 
				+            parts.extend(sentences)
			
 
				+        elif block:
			
 
				+            parts.append(block)
			
 
				+    return parts
			
 
				+
			
 
				+
			
 
				+async def get_or_create_sentence_cache_item(
			
 
				+    sentence: str,
			
 
				+    voice: str,
			
 
				+    speed: float,
			
 
				+    model_name: str,
			
 
				+) -> dict:
			
 
				+    key = sentence_cache_key(sentence, voice, speed, model_name)
			
 
				+    cached_item = get_memory_cache(key)
			
 
				+    if cached_item:
			
 
				+        return cached_item
			
 
				+
			
 
				+    disk_item = await load_sentence_from_disk(key)
			
 
				+    if disk_item:
			
 
				+        put_memory_cache(key, disk_item)
			
 
				+        return disk_item
			
 
				+
			
 
				+    def _synthesize_item():
			
 
				+        audio = synthesize_audio(sentence, voice=voice, speed=speed, model_name=model_name)
			
 
				+        wav_bytes = encode_wav_bytes(audio, sample_rate)
			
 
				+        return {
			
 
				+            "sentence": sentence,
			
 
				+            "sample_rate": sample_rate,
			
 
				+            "audio_bytes": wav_bytes,
			
 
				+            "audio": base64.b64encode(wav_bytes).decode(),
			
 
				+        }, audio
			
 
				+
			
 
				+    item, audio = await asyncio.to_thread(_synthesize_item)
			
 
				+    put_memory_cache(key, item)
			
 
				+    await save_sentence_to_disk(key, audio, sample_rate, sentence)
			
 
				+    await clean_disk_cache()
			
 
				+    return item
			
 
				+
			
 
				+
			
 
				 @app.post("/tts", summary="POST: 传入文本返回 WAV 流")
			
 
				 def tts_post(req: TTSRequest):
			
 
				     buf = synthesize_wav_bytes(
			
@@ -591,12 +720,7 @@ async def generate_audio_stream(data: Dict = Body(...)):
 
				         if not text.strip():
			
 
				             raise HTTPException(status_code=400, detail="文本不能为空")
			
 
				 
			
 
				-        if _is_english_voice(voice):
			
 
				-            parts = [graphemes for graphemes, _ in _phonemize_en_chunks_for_onnx(text)] if text.strip() else []
			
 
				-        else:
			
 
				-            parts = split_sentences(text) if text.strip() else []
			
 
				-        if not parts:
			
 
				-            parts = [text.strip()]
			
 
				+        parts = iter_text_parts(text, data.get("split_pattern", r"\n+"))
			
 
				 
			
 
				         if client_id in current_requests:
			
 
				             current_requests[client_id]["interrupt"] = True
			
@@ -607,37 +731,17 @@ async def generate_audio_stream(data: Dict = Body(...)):
 
				         async def stream():
			
 
				             try:
			
 
				                 for idx, part in enumerate(parts):
			
 
				-                    if not part:
			
 
				-                        continue
			
 
				                     if current_requests.get(client_id, {}).get("interrupt"):
			
 
				                         break
			
 
				 
			
 
				-                    audio = await asyncio.to_thread(
			
 
				-                        synthesize_audio,
			
 
				-                        text=part,
			
 
				-                        voice=voice,
			
 
				-                        speed=speed,
			
 
				-                        model_name=model,
			
 
				-                    )
			
 
				-
			
 
				-                    with io.BytesIO() as buf:
			
 
				-                        with sf.SoundFile(
			
 
				-                            buf,
			
 
				-                            "w",
			
 
				-                            sample_rate,
			
 
				-                            channels=audio.shape[1] if audio.ndim > 1 else 1,
			
 
				-                            format="WAV",
			
 
				-                            subtype="PCM_16",
			
 
				-                        ) as f:
			
 
				-                            f.write(audio)
			
 
				-                        audio_b64 = base64.b64encode(buf.getvalue()).decode()
			
 
				+                    item = await get_or_create_sentence_cache_item(part, voice, speed, model)
			
 
				 
			
 
				                     yield json.dumps(
			
 
				                         {
			
 
				                             "index": idx,
			
 
				-                            "sentence": part,
			
 
				-                            "audio": audio_b64,
			
 
				-                            "sample_rate": sample_rate,
			
 
				+                            "sentence": item["sentence"],
			
 
				+                            "audio": item["audio"],
			
 
				+                            "sample_rate": item["sample_rate"],
			
 
				                         }
			
 
				                     ).encode() + b"\n"
			
 
				             finally:
			
@@ -646,6 +750,59 @@ async def generate_audio_stream(data: Dict = Body(...)):
 
				         return StreamingResponse(stream(), media_type="application/x-ndjson")
			
 
				 
			
 
				 
			
 
				+@app.post("/generate_pcm", summary="POST: 返回更轻量的 PCM 分片流")
			
 
				+async def generate_audio_pcm_stream(data: Dict = Body(...)):
			
 
				+    async with request_semaphore:
			
 
				+        text = data.get("text", "")
			
 
				+        voice = data.get("voice", "af_heart")
			
 
				+        speed = float(data.get("speed", 1.0))
			
 
				+        model = data.get("model_name", DEFAULT_MODEL_NAME)
			
 
				+
			
 
				+        if not text.strip():
			
 
				+            raise HTTPException(status_code=400, detail="文本不能为空")
			
 
				+
			
 
				+        parts = iter_text_parts(text, data.get("split_pattern", r"\n+"))
			
 
				+
			
 
				+        async def pcm_stream():
			
 
				+            for idx, part in enumerate(parts):
			
 
				+                item = await get_or_create_sentence_cache_item(part, voice, speed, model)
			
 
				+                payload = wav_to_pcm_payload(item["audio_bytes"])
			
 
				+                header = json.dumps(
			
 
				+                    {
			
 
				+                        "index": idx,
			
 
				+                        "sentence": item["sentence"],
			
 
				+                        "sample_rate": item["sample_rate"],
			
 
				+                        "format": "s16le",
			
 
				+                        "bytes": len(payload),
			
 
				+                    }
			
 
				+                ).encode() + b"\n"
			
 
				+                yield header
			
 
				+                yield payload
			
 
				+
			
 
				+        return StreamingResponse(
			
 
				+            pcm_stream(),
			
 
				+            media_type="application/octet-stream",
			
 
				+            headers={"X-Audio-Format": "s16le", "X-Sample-Rate": str(sample_rate)},
			
 
				+        )
			
 
				+
			
 
				+
			
 
				+@app.get("/clear-cache")
			
 
				+async def clear_cache():
			
 
				+    with cache_lock:
			
 
				+        memory_cache.clear()
			
 
				+    for f in Path(CACHE_DIR).glob("*"):
			
 
				+        f.unlink()
			
 
				+    return {"status": "success"}
			
 
				+
			
 
				+
			
 
				+@app.get("/cache-info")
			
 
				+async def get_cache_info():
			
 
				+    with cache_lock:
			
 
				+        mem_count = len(memory_cache)
			
 
				+    disk_files = list(Path(CACHE_DIR).glob("*.wav"))
			
 
				+    return {"memory_cache": mem_count, "disk_cache": len(disk_files)}
			
 
				+
			
 
				+
			
 
				 if __name__ == "__main__":
			
 
				     import uvicorn
			
 
				 
			
--- a/start_onnx.sh
+++ b/start_onnx.sh
@@ -0,0 +1,4 @@
 
				+#!/bin/bash
			
 
				+eval "$(/root/miniconda3/bin/conda shell.bash hook)"
			
 
				+conda activate py311
			
 
				+nohup uvicorn speech_tts_onnx:app --host 0.0.0.0 --port 8028 &