Raw Record Source

{
  "$type": "site.standard.document",
  "bskyPostRef": {
    "cid": "bafyreihuremczyztxmazt7eican7o7ak6ozok4nakmfz3pssajhqldht64",
    "uri": "at://did:plc:pgryn3ephfd2xgft23qokfzt/app.bsky.feed.post/3mjarddsxgzr2"
  },
  "path": "/t/streamer-ai-like-neuro-sama/33836?page=2#post_44",
  "publishedAt": "2026-04-11T20:10:05.000Z",
  "site": "https://discuss.huggingface.co",
  "textContent": "here’s my personal whisper setup ya at the moment (cause whisper sucks)\nversion 1-\n\n\n    # Use larger model for better transcription quality\n\n    # Options: “tiny.en”, “base.en”, “small.en”, “medium.en”, “large”, “large-v3”\n\n    # larger models are more accurate but slower -JK THEY SUCK\n\n    # Try “large-v3” for best quality, or “large” if v3 is not available _DON'T IT SUCKS\n\n    WHISPER_MODEL = “small.en”  # Base.en and small are the best quality models in testing\n\n    stt = whisper.load_model(WHISPER_MODEL)\n\n    def safe_transcribe(audio_np, compression_ratio_threshold=2.4, logprob_threshold=-1.0, no_speech_threshold=0.6):\n\n    if audio_np is None:\n\n\n\n\n        console.print(\"\\[yellow\\]⚠️ safe_transcribe: audio_np is None\\[/yellow\\]\")\n\n        return \"\"\n\n\n\n    if audio_np.size < 16000:  # < 1 second @ 16kHz\n\n        console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too short ({audio_np.size} < 16000)\\[/yellow\\]\")\n\n        return \"\"\n\n\n\n    if np.max(np.abs(audio_np)) < 0.01:\n\n        console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too quiet ({np.max(np.abs(audio_np)):.4f} < 0.01)\\[/yellow\\]\")\n\n        return \"\"\n\n\n\n    try:\n\n        console.print(f\"\\[cyan\\]🔍 Whisper transcribing {len(audio_np)} samples...\\[/cyan\\]\")\n\n\n\n        \\# Improved transcription parameters for better accuracy\n\n        result = stt.transcribe(\n\n            audio_np,\n\n            fp16=False,\n\n            language=\"en\",  # Force English for better accuracy\n\n            temperature=0.0,\n\n            compression_ratio_threshold=compression_ratio_threshold,\n\n            logprob_threshold=logprob_threshold,\n\n            no_speech_threshold=no_speech_threshold,\n\n            initial_prompt=\"This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times.\",\n\n            
condition_on_previous_text=True,  # Use previous context for better transcription\n\n            word_timestamps=False,  # Disable for faster processing\n\n        )\n\n\n\n        console.print(f\"\\[cyan\\]🔍 Whisper result type: {type(result)}\\[/cyan\\]\")\n\n\n\n        if isinstance(result, dict):\n\n            text = result.get(\"text\", \"\").strip()\n\n            console.print(f\"\\[cyan\\]🔍 Whisper dict result: '{text}'\\[/cyan\\]\")\n\n\n\n            \\# Filter out initial_prompt leakage\n\n            prompt_text = \"This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times.\"\n\n            if prompt_text in text:\n\n                text = text.replace(prompt_text, \"\").strip()\n\n                console.print(f\"\\[green\\]🧹 Filtered prompt from transcription: '{text}'\\[/green\\]\")\n\n\n\n            \\# Get additional debug info\n\n            if \"segments\" in result and result\\[\"segments\"\\]:\n\n                \\# Log average logprob as a quality indicator\n\n                avg_logprob = sum(s.get(\"avg_logprob\", 0) for s in result\\[\"segments\"\\]) / len(result\\[\"segments\"\\])\n\n                console.print(f\"\\[cyan\\]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}\\[/cyan\\]\")\n\n\n\n            return text\n\n        elif isinstance(result, str):\n\n            console.print(f\"\\[cyan\\]🔍 Whisper string result: '{result}'\\[/cyan\\]\")\n\n\n\n            \\# Filter out initial_prompt leakage\n\n            prompt_text = \"This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times.\"\n\n            if prompt_text in text:\n\n                text = text.replace(prompt_text, \"\").strip()\n\n                console.print(f\"\\[green\\]🧹 Filtered prompt from transcription: '{text}'\\[/green\\]\")\n\n\n\n            return result.strip()\n\n        
else:\n\n            console.print(f\"\\[yellow\\]⚠️ Whisper unexpected result type: {type(result)}\\[/yellow\\]\")\n\n            return \"\"\n\n\n\n    except Exception as e:\n\n        console.print(f\"\\[red\\]❌ Whisper transcription error: {e}\\[/red\\]\")\n\n        import traceback\n\n        traceback.print_exc()\n\n        return \"\"\n\n\nver 2- older\n\n\n    # Use larger model for better transcription quality\n\n    # Options: “tiny.en”, “base.en”, “small.en”, “medium.en”, “large”, “large-v3”\n\n    # larger models are more accurate but slower\n\n    # Try “large-v3” for best quality, or “large” if v3 is not available\n\n    WHISPER_MODEL = “base.en”  # Best quality model\n\n    stt = whisper.load_model(WHISPER_MODEL)\n\n    def safe_transcribe(audio_np, temperature=0.0, compression_ratio_threshold=2.4, logprob_threshold=-1.0, no_speech_threshold=0.6):\n\n\n\n\n\n    if audio_np is None:\n\n        console.print(\"\\[yellow\\]⚠️ safe_transcribe: audio_np is None\\[/yellow\\]\")\n\n        return \"\"\n\n\n\n    if audio_np.size < 16000:  # < 1 second @ 16kHz\n\n        console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too short ({audio_np.size} < 16000)\\[/yellow\\]\")\n\n        return \"\"\n\n\n\n    if np.max(np.abs(audio_np)) < 0.01:\n\n        console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too quiet ({np.max(np.abs(audio_np)):.4f} < 0.01)\\[/yellow\\]\")\n\n        return \"\"\n\n\n\n    try:\n\n        console.print(f\"\\[cyan\\]🔍 Whisper transcribing {len(audio_np)} samples...\\[/cyan\\]\")\n\n\n\n        \\# Improved transcription parameters for better accuracy\n\n        result = stt.transcribe(\n\n            audio_np,\n\n            fp16=False,\n\n            language=\"en\",  # Force English for better accuracy\n\n            temperature=temperature, #or remove the other whisper and Just change this to 0.0\n\n            compression_ratio_threshold=compression_ratio_threshold,\n\n            
logprob_threshold=logprob_threshold,\n\n            no_speech_threshold=no_speech_threshold,\n\n            initial_prompt=\"This is a clear spoken conversation. Please transcribe accurately.\",  # You can remove the initial_prompt if ya want\n\n            condition_on_previous_text=True,  # Use previous context for better transcription\n\n            word_timestamps=False,  # Disable for faster processing\n\n        )\n\n\n\n        console.print(f\"\\[cyan\\]🔍 Whisper result type: {type(result)}\\[/cyan\\]\")\n\n\n\n        if isinstance(result, dict):\n\n            text = result.get(\"text\", \"\").strip()\n\n            console.print(f\"\\[cyan\\]🔍 Whisper dict result: '{text}'\\[/cyan\\]\")\n\n\n\n            \\# Get additional debug info\n\n            if \"segments\" in result and result\\[\"segments\"\\]:\n\n                \\# Log average logprob as a quality indicator\n\n                avg_logprob = sum(s.get(\"avg_logprob\", 0) for s in result\\[\"segments\"\\]) / len(result\\[\"segments\"\\])\n\n                console.print(f\"\\[cyan\\]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}\\[/cyan\\]\")\n\n\n\n            return text\n\n        elif isinstance(result, str):\n\n            console.print(f\"\\[cyan\\]🔍 Whisper string result: '{result}'\\[/cyan\\]\")\n\n            return result.strip()\n\n        else:\n\n            console.print(f\"\\[yellow\\]⚠️ Whisper unexpected result type: {type(result)}\\[/yellow\\]\")\n\n            return \"\"\n\n\n\n    except Exception as e:\n\n        console.print(f\"\\[red\\]❌ Whisper transcription error: {e}\\[/red\\]\")\n\n        import traceback\n\n        traceback.print_exc()\n\n        return \"\"\n\n\nor another one that I just edited, which seems to be giving good results now that I have messed with it\n\n\n    # Use larger model for better transcription quality\n\n    # Options: “tiny.en”, “base.en”, “small.en”, “medium.en”, “large”, “large-v3”\n\n    # larger models are more accurate but 
slower\n\n    # Try “large-v3” for best quality, or “large” if v3 is not available\n\n    WHISPER_MODEL = “small.en”  # Best quality model\n\n    stt = whisper.load_model(WHISPER_MODEL)\n\n    def safe_transcribe(audio_np, compression_ratio_threshold=2.4, logprob_threshold=-1.0, no_speech_threshold=0.6):\n\n\n\n    if audio_np is None:\n\n        console.print(\"\\[yellow\\]⚠️ safe_transcribe: audio_np is None\\[/yellow\\]\")\n\n        return \"\"\n\n    # I KNOW that most of this is the same Don't judge me\n\n    if audio_np.size < 16000:  # < 1 second @ 16kHz\n\n        console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too short ({audio_np.size} < 16000)\\[/yellow\\]\")\n\n        return \"\"\n\n\n\n    if np.max(np.abs(audio_np)) < 0.02: #TURN DIS SHIT UP TO PREVENT digital hallucinations-\n\n        console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too quiet ({np.max(np.abs(audio_np)):.4f} < 0.01)\\[/yellow\\]\")\n\n        return \"\"\n\n\n\n    try:\n\n        console.print(f\"\\[cyan\\]🔍 Whisper transcribing {len(audio_np)} samples...\\[/cyan\\]\")\n    #as you can tell I actually add notes to the code\n\n\n        \\# Improved transcription parameters for better accuracy\n\n        result = stt.transcribe(\n\n            audio_np,\n\n            fp16=False,\n\n            language=\"en\",  # Force English for better accuracy\n\n            temperature=0.0,\n\n            compression_ratio_threshold=compression_ratio_threshold,\n\n            logprob_threshold=logprob_threshold,\n\n            no_speech_threshold=no_speech_threshold,#BRUH i had to remove initial twice\n\n            condition_on_previous_text=False,  # This Will repeat the same word 20 times... 
so set it to false trust me lol\n\n            word_timestamps=False,  # Disable for faster processing\n\n        )\n\n\n\n        console.print(f\"\\[cyan\\]🔍 Whisper result type: {type(result)}\\[/cyan\\]\")\n\n\n\n        if isinstance(result, dict):\n\n            text = result.get(\"text\", \"\").strip()\n\n            console.print(f\"\\[cyan\\]🔍 Whisper dict result: '{text}'\\[/cyan\\]\")\n\n\n\n            \\# Filter out initial_prompt leakage (silent)\n\n            prompt_text = \"This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times.\"\n\n            if prompt_text in text:\n\n                text = text.replace(prompt_text, \"\").strip()\n\n\n\n            \\# Get additional debug info\n\n            if \"segments\" in result and result\\[\"segments\"\\]:\n\n                \\# Log average logprob as a quality indicator\n\n                avg_logprob = sum(s.get(\"avg_logprob\", 0) for s in result\\[\"segments\"\\]) / len(result\\[\"segments\"\\])\n\n                console.print(f\"\\[cyan\\]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}\\[/cyan\\]\")\n\n\n\n            return text\n\n        elif isinstance(result, str):\n\n            console.print(f\"\\[cyan\\]🔍 Whisper string result: '{result}'\\[/cyan\\]\")\n\n\n\n            \\# Filter out initial_prompt leakage (silent)\n\n            prompt_text = \"This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times.\"\n\n            if prompt_text in result:\n\n                result = result.replace(prompt_text, \"\").strip()\n\n\n\n            return result.strip()\n\n        else:\n\n            console.print(f\"\\[yellow\\]⚠️ Whisper unexpected result type: {type(result)}\\[/yellow\\]\")\n\n            return \"\"\n\n\n\n    except Exception as e:\n\n        console.print(f\"\\[red\\]❌ Whisper transcription error: 
{e}\\[/red\\]\")\n\n        import traceback\n\n        traceback.print_exc()\n\n        return \"\"\n\n\nextra stuff that is needed\n\n\n    # ------------------ CONSTANTS ------------------\n\n    MAX_RECORD_SECONDS = 20\n\n    NO_WORD_TIMEOUT = 10.5\n\n    WORD_CHECK_INTERVAL = 1.0\n\n    MIN_CHECK_AUDIO_SEC = 1.5\n",
  "title": "Streamer AI (Like Neuro-Sama)"
}