{
"$type": "site.standard.document",
"bskyPostRef": {
"cid": "bafyreihuremczyztxmazt7eican7o7ak6ozok4nakmfz3pssajhqldht64",
"uri": "at://did:plc:pgryn3ephfd2xgft23qokfzt/app.bsky.feed.post/3mjarddsxgzr2"
},
"path": "/t/streamer-ai-like-neuro-sama/33836?page=2#post_44",
"publishedAt": "2026-04-11T20:10:05.000Z",
"site": "https://discuss.huggingface.co",
"textContent": "here’s my personal whisper setup ya at the moment (cause whisper sucks)\nversion 1-\n\n\n # Use larger model for better transcription quality\n\n # Options: “tiny.en”, “base.en”, “small.en”, “medium.en”, “large”, “large-v3”\n\n # larger models are more accurate but slower -JK THEY SUCK\n\n # Try “large-v3” for best quality, or “large” if v3 is not available _DON'T IT SUCKS\n\n WHISPER_MODEL = “small.en” # Base.en and small are the best quality models in testing\n\n stt = whisper.load_model(WHISPER_MODEL)\n\n def safe_transcribe(audio_np, compression_ratio_threshold=2.4, logprob_threshold=-1.0, no_speech_threshold=0.6):\n\n if audio_np is None:\n\n\n\n\n console.print(\"\\[yellow\\]⚠️ safe_transcribe: audio_np is None\\[/yellow\\]\")\n\n return \"\"\n\n\n\n if audio_np.size < 16000: # < 1 second @ 16kHz\n\n console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too short ({audio_np.size} < 16000)\\[/yellow\\]\")\n\n return \"\"\n\n\n\n if np.max(np.abs(audio_np)) < 0.01:\n\n console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too quiet ({np.max(np.abs(audio_np)):.4f} < 0.01)\\[/yellow\\]\")\n\n return \"\"\n\n\n\n try:\n\n console.print(f\"\\[cyan\\]🔍 Whisper transcribing {len(audio_np)} samples...\\[/cyan\\]\")\n\n\n\n \\# Improved transcription parameters for better accuracy\n\n result = stt.transcribe(\n\n audio_np,\n\n fp16=False,\n\n language=\"en\", # Force English for better accuracy\n\n temperature=0.0,\n\n compression_ratio_threshold=compression_ratio_threshold,\n\n logprob_threshold=logprob_threshold,\n\n no_speech_threshold=no_speech_threshold,\n\n initial_prompt=\"This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times.\",\n\n condition_on_previous_text=True, # Use previous context for better transcription\n\n word_timestamps=False, # Disable for faster processing\n\n )\n\n\n\n console.print(f\"\\[cyan\\]🔍 Whisper result type: 
{type(result)}\\[/cyan\\]\")\n\n\n\n if isinstance(result, dict):\n\n text = result.get(\"text\", \"\").strip()\n\n console.print(f\"\\[cyan\\]🔍 Whisper dict result: '{text}'\\[/cyan\\]\")\n\n\n\n \\# Filter out initial_prompt leakage\n\n prompt_text = \"This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times.\"\n\n if prompt_text in text:\n\n text = text.replace(prompt_text, \"\").strip()\n\n console.print(f\"\\[green\\]🧹 Filtered prompt from transcription: '{text}'\\[/green\\]\")\n\n\n\n \\# Get additional debug info\n\n if \"segments\" in result and result\\[\"segments\"\\]:\n\n \\# Log average logprob as a quality indicator\n\n avg_logprob = sum(s.get(\"avg_logprob\", 0) for s in result\\[\"segments\"\\]) / len(result\\[\"segments\"\\])\n\n console.print(f\"\\[cyan\\]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}\\[/cyan\\]\")\n\n\n\n return text\n\n elif isinstance(result, str):\n\n console.print(f\"\\[cyan\\]🔍 Whisper string result: '{result}'\\[/cyan\\]\")\n\n\n\n \\# Filter out initial_prompt leakage\n\n prompt_text = \"This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times.\"\n\n if prompt_text in text:\n\n text = text.replace(prompt_text, \"\").strip()\n\n console.print(f\"\\[green\\]🧹 Filtered prompt from transcription: '{text}'\\[/green\\]\")\n\n\n\n return result.strip()\n\n else:\n\n console.print(f\"\\[yellow\\]⚠️ Whisper unexpected result type: {type(result)}\\[/yellow\\]\")\n\n return \"\"\n\n\n\n except Exception as e:\n\n console.print(f\"\\[red\\]❌ Whisper transcription error: {e}\\[/red\\]\")\n\n import traceback\n\n traceback.print_exc()\n\n return \"\"\n\n\nver 2- older\n\n\n # Use larger model for better transcription quality\n\n # Options: “tiny.en”, “base.en”, “small.en”, “medium.en”, “large”, “large-v3”\n\n # larger models are more accurate but 
slower\n\n # Try “large-v3” for best quality, or “large” if v3 is not available\n\n WHISPER_MODEL = “base.en” # Best quality model\n\n stt = whisper.load_model(WHISPER_MODEL)\n\n def safe_transcribe(audio_np, temperature=0.0, compression_ratio_threshold=2.4, logprob_threshold=-1.0, no_speech_threshold=0.6):\n\n\n\n\n\n if audio_np is None:\n\n console.print(\"\\[yellow\\]⚠️ safe_transcribe: audio_np is None\\[/yellow\\]\")\n\n return \"\"\n\n\n\n if audio_np.size < 16000: # < 1 second @ 16kHz\n\n console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too short ({audio_np.size} < 16000)\\[/yellow\\]\")\n\n return \"\"\n\n\n\n if np.max(np.abs(audio_np)) < 0.01:\n\n console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too quiet ({np.max(np.abs(audio_np)):.4f} < 0.01)\\[/yellow\\]\")\n\n return \"\"\n\n\n\n try:\n\n console.print(f\"\\[cyan\\]🔍 Whisper transcribing {len(audio_np)} samples...\\[/cyan\\]\")\n\n\n\n \\# Improved transcription parameters for better accuracy\n\n result = stt.transcribe(\n\n audio_np,\n\n fp16=False,\n\n language=\"en\", # Force English for better accuracy\n\n temperature=temperature, #or remove the other whisper and Just change this to 0.0\n\n compression_ratio_threshold=compression_ratio_threshold,\n\n logprob_threshold=logprob_threshold,\n\n no_speech_threshold=no_speech_threshold,\n\n initial_prompt=\"This is a clear spoken conversation. 
Please transcribe accurately.\", # You can remove the initial_prompt if ya want\n\n condition_on_previous_text=True, # Use previous context for better transcription\n\n word_timestamps=False, # Disable for faster processing\n\n )\n\n\n\n console.print(f\"\\[cyan\\]🔍 Whisper result type: {type(result)}\\[/cyan\\]\")\n\n\n\n if isinstance(result, dict):\n\n text = result.get(\"text\", \"\").strip()\n\n console.print(f\"\\[cyan\\]🔍 Whisper dict result: '{text}'\\[/cyan\\]\")\n\n\n\n \\# Get additional debug info\n\n if \"segments\" in result and result\\[\"segments\"\\]:\n\n \\# Log average logprob as a quality indicator\n\n avg_logprob = sum(s.get(\"avg_logprob\", 0) for s in result\\[\"segments\"\\]) / len(result\\[\"segments\"\\])\n\n console.print(f\"\\[cyan\\]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}\\[/cyan\\]\")\n\n\n\n return text\n\n elif isinstance(result, str):\n\n console.print(f\"\\[cyan\\]🔍 Whisper string result: '{result}'\\[/cyan\\]\")\n\n return result.strip()\n\n else:\n\n console.print(f\"\\[yellow\\]⚠️ Whisper unexpected result type: {type(result)}\\[/yellow\\]\")\n\n return \"\"\n\n\n\n except Exception as e:\n\n console.print(f\"\\[red\\]❌ Whisper transcription error: {e}\\[/red\\]\")\n\n import traceback\n\n traceback.print_exc()\n\n return \"\"\n\n\nor another one that I just edited, which seems to be giving good results now that I've messed with it\n\n\n # Use larger model for better transcription quality\n\n # Options: “tiny.en”, “base.en”, “small.en”, “medium.en”, “large”, “large-v3”\n\n # larger models are more accurate but slower\n\n # Try “large-v3” for best quality, or “large” if v3 is not available\n\n WHISPER_MODEL = “small.en” # Best quality model\n\n stt = whisper.load_model(WHISPER_MODEL)\n\n def safe_transcribe(audio_np, compression_ratio_threshold=2.4, logprob_threshold=-1.0, no_speech_threshold=0.6):\n\n\n\n if audio_np is None:\n\n console.print(\"\\[yellow\\]⚠️ safe_transcribe: audio_np is None\\[/yellow\\]\")\n\n 
return \"\"\n\n # I KNOW that most of this is the same, don't judge me\n\n if audio_np.size < 16000: # < 1 second @ 16kHz\n\n console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too short ({audio_np.size} < 16000)\\[/yellow\\]\")\n\n return \"\"\n\n\n\n if np.max(np.abs(audio_np)) < 0.02: #TURN DIS SHIT UP TO PREVENT digital hallucinations-\n\n console.print(f\"\\[yellow\\]⚠️ safe_transcribe: audio too quiet ({np.max(np.abs(audio_np)):.4f} < 0.01)\\[/yellow\\]\")\n\n return \"\"\n\n\n\n try:\n\n console.print(f\"\\[cyan\\]🔍 Whisper transcribing {len(audio_np)} samples...\\[/cyan\\]\")\n #as you can tell I actually add notes to the code\n\n\n \\# Improved transcription parameters for better accuracy\n\n result = stt.transcribe(\n\n audio_np,\n\n fp16=False,\n\n language=\"en\", # Force English for better accuracy\n\n temperature=0.0,\n\n compression_ratio_threshold=compression_ratio_threshold,\n\n logprob_threshold=logprob_threshold,\n\n no_speech_threshold=no_speech_threshold,#BRUH I had to remove initial_prompt twice\n\n condition_on_previous_text=False, # This will repeat the same word 20 times... 
so set it to false trust me lol\n\n word_timestamps=False, # Disable for faster processing\n\n )\n\n\n\n console.print(f\"\\[cyan\\]🔍 Whisper result type: {type(result)}\\[/cyan\\]\")\n\n\n\n if isinstance(result, dict):\n\n text = result.get(\"text\", \"\").strip()\n\n console.print(f\"\\[cyan\\]🔍 Whisper dict result: '{text}'\\[/cyan\\]\")\n\n\n\n \\# Filter out initial_prompt leakage (silent)\n\n prompt_text = \"This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times.\"\n\n if prompt_text in text:\n\n text = text.replace(prompt_text, \"\").strip()\n\n\n\n \\# Get additional debug info\n\n if \"segments\" in result and result\\[\"segments\"\\]:\n\n \\# Log average logprob as a quality indicator\n\n avg_logprob = sum(s.get(\"avg_logprob\", 0) for s in result\\[\"segments\"\\]) / len(result\\[\"segments\"\\])\n\n console.print(f\"\\[cyan\\]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}\\[/cyan\\]\")\n\n\n\n return text\n\n elif isinstance(result, str):\n\n console.print(f\"\\[cyan\\]🔍 Whisper string result: '{result}'\\[/cyan\\]\")\n\n\n\n \\# Filter out initial_prompt leakage (silent)\n\n prompt_text = \"This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times.\"\n\n if prompt_text in result:\n\n result = result.replace(prompt_text, \"\").strip()\n\n\n\n return result.strip()\n\n else:\n\n console.print(f\"\\[yellow\\]⚠️ Whisper unexpected result type: {type(result)}\\[/yellow\\]\")\n\n return \"\"\n\n\n\n except Exception as e:\n\n console.print(f\"\\[red\\]❌ Whisper transcription error: {e}\\[/red\\]\")\n\n import traceback\n\n traceback.print_exc()\n\n return \"\"\n\n\nextra stuff that is needed\n\n\n # ------------------ CONSTANTS ------------------\n\n MAX_RECORD_SECONDS = 20\n\n NO_WORD_TIMEOUT = 10.5\n\n WORD_CHECK_INTERVAL = 1.0\n\n MIN_CHECK_AUDIO_SEC = 1.5\n",
"title": "Streamer AI (Like Neuro-Sama)"
}