Streamer AI (Like Neuro-Sama)
Hugging Face Forums [Unofficial]
April 11, 2026
Here’s my personal Whisper setup at the moment (because Whisper sucks). Version 1:
# Whisper model size to load.
# Options: "tiny.en", "base.en", "small.en", "medium.en", "large", "large-v3"
# Larger models are nominally more accurate but slower. In the author's own
# testing, however, "large" and "large-v3" performed poorly, while "base.en"
# and "small.en" gave the best transcription quality.
WHISPER_MODEL = "small.en"
# Load the model once at import time so every transcription call reuses it.
stt = whisper.load_model(WHISPER_MODEL)
def safe_transcribe(audio_np, compression_ratio_threshold=2.4, logprob_threshold=-1.0, no_speech_threshold=0.6):
    """Transcribe an audio buffer with Whisper, never raising.

    The buffer is rejected (and "" returned) when it is None, shorter than
    16000 samples, or its peak absolute amplitude is below 0.01. Otherwise it
    is passed to the module-level ``stt`` model with English forced,
    temperature 0.0 and an initial prompt; the prompt is removed again if it
    leaks into the transcription.

    Relies on module-level ``np``, ``stt`` and ``console``. ``console`` is
    presumably a Rich console (the messages use ``[color]`` tags) -- confirm.

    Args:
        audio_np: Array of audio samples. The length check assumes 16 kHz
            audio -- confirm against the caller.
        compression_ratio_threshold: Forwarded unchanged to ``stt.transcribe``.
        logprob_threshold: Forwarded unchanged to ``stt.transcribe``.
        no_speech_threshold: Forwarded unchanged to ``stt.transcribe``.

    Returns:
        The stripped transcription text, or "" if the audio was rejected,
        the result had an unexpected type, or transcription raised.
    """
    min_samples = 16000  # < 1 second @ 16kHz
    min_peak = 0.01
    # Sent to Whisper as the initial prompt and filtered out of the result below.
    initial_prompt = (
        "This is a clear spoken conversation improve transcription to where it "
        "sounds like something a human would say do not repeat words multiple times."
    )

    if audio_np is None:
        console.print("[yellow]⚠️ safe_transcribe: audio_np is None[/yellow]")
        return ""
    if audio_np.size < min_samples:
        console.print(f"[yellow]⚠️ safe_transcribe: audio too short ({audio_np.size} < {min_samples})[/yellow]")
        return ""
    # Computed once and reused for both the check and the log message.
    peak = np.max(np.abs(audio_np))
    if peak < min_peak:
        console.print(f"[yellow]⚠️ safe_transcribe: audio too quiet ({peak:.4f} < {min_peak})[/yellow]")
        return ""

    try:
        console.print(f"[cyan]🔍 Whisper transcribing {len(audio_np)} samples...[/cyan]")
        result = stt.transcribe(
            audio_np,
            fp16=False,
            language="en",  # Force English for better accuracy
            temperature=0.0,
            compression_ratio_threshold=compression_ratio_threshold,
            logprob_threshold=logprob_threshold,
            no_speech_threshold=no_speech_threshold,
            initial_prompt=initial_prompt,
            condition_on_previous_text=True,  # Use previous context for better transcription
            word_timestamps=False,  # Disable for faster processing
        )
        console.print(f"[cyan]🔍 Whisper result type: {type(result)}[/cyan]")

        if isinstance(result, dict):
            text = result.get("text", "").strip()
            console.print(f"[cyan]🔍 Whisper dict result: '{text}'[/cyan]")
            # Filter out initial_prompt leakage
            if initial_prompt in text:
                text = text.replace(initial_prompt, "").strip()
                console.print(f"[green]🧹 Filtered prompt from transcription: '{text}'[/green]")
            # Log the mean per-segment avg_logprob as a rough quality indicator.
            segments = result.get("segments")
            if segments:
                avg_logprob = sum(s.get("avg_logprob", 0) for s in segments) / len(segments)
                console.print(f"[cyan]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}[/cyan]")
            return text

        if isinstance(result, str):
            console.print(f"[cyan]🔍 Whisper string result: '{result}'[/cyan]")
            # This branch previously tested an undefined ``text`` variable, so
            # it always fell into the except handler and returned "".
            text = result.strip()
            if initial_prompt in text:
                text = text.replace(initial_prompt, "").strip()
                console.print(f"[green]🧹 Filtered prompt from transcription: '{text}'[/green]")
            return text

        console.print(f"[yellow]⚠️ Whisper unexpected result type: {type(result)}[/yellow]")
        return ""
    except Exception as e:
        # Best-effort boundary: report the failure and return "" so the
        # caller's loop keeps running.
        console.print(f"[red]❌ Whisper transcription error: {e}[/red]")
        import traceback
        traceback.print_exc()
        return ""
Version 2 (older):
# Whisper model size to load.
# Options: "tiny.en", "base.en", "small.en", "medium.en", "large", "large-v3"
# Larger models are more accurate but slower.
# Try "large-v3" for best quality, or "large" if v3 is not available.
WHISPER_MODEL = "base.en"  # Best quality model
# Load the model once at import time so every transcription call reuses it.
stt = whisper.load_model(WHISPER_MODEL)
def safe_transcribe(audio_np, temperature=0.0, compression_ratio_threshold=2.4, logprob_threshold=-1.0, no_speech_threshold=0.6):
    """Transcribe an audio buffer with Whisper, never raising.

    The buffer is rejected (and "" returned) when it is None, shorter than
    16000 samples, or its peak absolute amplitude is below 0.01. Otherwise it
    is passed to the module-level ``stt`` model with English forced and a
    short initial prompt.

    Relies on module-level ``np``, ``stt`` and ``console``. ``console`` is
    presumably a Rich console (the messages use ``[color]`` tags) -- confirm.

    Args:
        audio_np: Array of audio samples. The length check assumes 16 kHz
            audio -- confirm against the caller.
        temperature: Forwarded unchanged to ``stt.transcribe``; defaults to 0.0.
        compression_ratio_threshold: Forwarded unchanged to ``stt.transcribe``.
        logprob_threshold: Forwarded unchanged to ``stt.transcribe``.
        no_speech_threshold: Forwarded unchanged to ``stt.transcribe``.

    Returns:
        The stripped transcription text, or "" if the audio was rejected,
        the result had an unexpected type, or transcription raised.
    """
    min_samples = 16000  # < 1 second @ 16kHz
    min_peak = 0.01

    if audio_np is None:
        console.print("[yellow]⚠️ safe_transcribe: audio_np is None[/yellow]")
        return ""
    if audio_np.size < min_samples:
        console.print(f"[yellow]⚠️ safe_transcribe: audio too short ({audio_np.size} < {min_samples})[/yellow]")
        return ""
    # Computed once and reused for both the check and the log message.
    peak = np.max(np.abs(audio_np))
    if peak < min_peak:
        console.print(f"[yellow]⚠️ safe_transcribe: audio too quiet ({peak:.4f} < {min_peak})[/yellow]")
        return ""

    try:
        console.print(f"[cyan]🔍 Whisper transcribing {len(audio_np)} samples...[/cyan]")
        result = stt.transcribe(
            audio_np,
            fp16=False,
            language="en",  # Force English for better accuracy
            temperature=temperature,  # Caller-tunable; could also be hard-coded to 0.0
            compression_ratio_threshold=compression_ratio_threshold,
            logprob_threshold=logprob_threshold,
            no_speech_threshold=no_speech_threshold,
            # The initial prompt is optional and can be removed.
            initial_prompt="This is a clear spoken conversation. Please transcribe accurately.",
            condition_on_previous_text=True,  # Use previous context for better transcription
            word_timestamps=False,  # Disable for faster processing
        )
        console.print(f"[cyan]🔍 Whisper result type: {type(result)}[/cyan]")

        if isinstance(result, dict):
            text = result.get("text", "").strip()
            console.print(f"[cyan]🔍 Whisper dict result: '{text}'[/cyan]")
            # Log the mean per-segment avg_logprob as a rough quality indicator.
            segments = result.get("segments")
            if segments:
                avg_logprob = sum(s.get("avg_logprob", 0) for s in segments) / len(segments)
                console.print(f"[cyan]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}[/cyan]")
            return text

        if isinstance(result, str):
            console.print(f"[cyan]🔍 Whisper string result: '{result}'[/cyan]")
            return result.strip()

        console.print(f"[yellow]⚠️ Whisper unexpected result type: {type(result)}[/yellow]")
        return ""
    except Exception as e:
        # Best-effort boundary: report the failure and return "" so the
        # caller's loop keeps running.
        console.print(f"[red]❌ Whisper transcription error: {e}[/red]")
        import traceback
        traceback.print_exc()
        return ""
Or another one that I just edited, which seems to be giving good results now that I've messed with it:
# Whisper model size to load.
# Options: "tiny.en", "base.en", "small.en", "medium.en", "large", "large-v3"
# Larger models are more accurate but slower.
# Try "large-v3" for best quality, or "large" if v3 is not available.
WHISPER_MODEL = "small.en"  # Best quality model
# Load the model once at import time so every transcription call reuses it.
stt = whisper.load_model(WHISPER_MODEL)
def safe_transcribe(audio_np, compression_ratio_threshold=2.4, logprob_threshold=-1.0, no_speech_threshold=0.6):
    """Transcribe an audio buffer with Whisper, never raising.

    The buffer is rejected (and "" returned) when it is None, shorter than
    16000 samples, or its peak absolute amplitude is below 0.02. Otherwise it
    is passed to the module-level ``stt`` model with English forced,
    temperature 0.0, no initial prompt and ``condition_on_previous_text``
    turned off.

    Relies on module-level ``np``, ``stt`` and ``console``. ``console`` is
    presumably a Rich console (the messages use ``[color]`` tags) -- confirm.

    Args:
        audio_np: Array of audio samples. The length check assumes 16 kHz
            audio -- confirm against the caller.
        compression_ratio_threshold: Forwarded unchanged to ``stt.transcribe``.
        logprob_threshold: Forwarded unchanged to ``stt.transcribe``.
        no_speech_threshold: Forwarded unchanged to ``stt.transcribe``.

    Returns:
        The stripped transcription text, or "" if the audio was rejected,
        the result had an unexpected type, or transcription raised.
    """
    min_samples = 16000  # < 1 second @ 16kHz
    # Raised from 0.01 to keep Whisper from hallucinating text on near-silent audio.
    min_peak = 0.02
    # Prompt text filtered out of results. This version passes no
    # initial_prompt, so the filter is only a leftover safeguard.
    prompt_text = (
        "This is a clear spoken conversation improve transcription to where it "
        "sounds like something a human would say do not repeat words multiple times."
    )

    if audio_np is None:
        console.print("[yellow]⚠️ safe_transcribe: audio_np is None[/yellow]")
        return ""
    if audio_np.size < min_samples:
        console.print(f"[yellow]⚠️ safe_transcribe: audio too short ({audio_np.size} < {min_samples})[/yellow]")
        return ""
    # Computed once and reused for both the check and the log message.
    peak = np.max(np.abs(audio_np))
    if peak < min_peak:
        # The message now reports the threshold actually applied (it used to
        # say 0.01 while the check used 0.02).
        console.print(f"[yellow]⚠️ safe_transcribe: audio too quiet ({peak:.4f} < {min_peak})[/yellow]")
        return ""

    try:
        console.print(f"[cyan]🔍 Whisper transcribing {len(audio_np)} samples...[/cyan]")
        result = stt.transcribe(
            audio_np,
            fp16=False,
            language="en",  # Force English for better accuracy
            temperature=0.0,
            compression_ratio_threshold=compression_ratio_threshold,
            logprob_threshold=logprob_threshold,
            no_speech_threshold=no_speech_threshold,
            # initial_prompt intentionally removed in this version.
            # Leaving this True made Whisper repeat the same word many times.
            condition_on_previous_text=False,
            word_timestamps=False,  # Disable for faster processing
        )
        console.print(f"[cyan]🔍 Whisper result type: {type(result)}[/cyan]")

        if isinstance(result, dict):
            text = result.get("text", "").strip()
            console.print(f"[cyan]🔍 Whisper dict result: '{text}'[/cyan]")
            # Filter out initial_prompt leakage (silent)
            if prompt_text in text:
                text = text.replace(prompt_text, "").strip()
            # Log the mean per-segment avg_logprob as a rough quality indicator.
            segments = result.get("segments")
            if segments:
                avg_logprob = sum(s.get("avg_logprob", 0) for s in segments) / len(segments)
                console.print(f"[cyan]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}[/cyan]")
            return text

        if isinstance(result, str):
            console.print(f"[cyan]🔍 Whisper string result: '{result}'[/cyan]")
            # Filter out initial_prompt leakage (silent)
            if prompt_text in result:
                result = result.replace(prompt_text, "").strip()
            return result.strip()

        console.print(f"[yellow]⚠️ Whisper unexpected result type: {type(result)}[/yellow]")
        return ""
    except Exception as e:
        # Best-effort boundary: report the failure and return "" so the
        # caller's loop keeps running.
        console.print(f"[red]❌ Whisper transcription error: {e}[/red]")
        import traceback
        traceback.print_exc()
        return ""
Extra stuff that is needed:
# ------------------ CONSTANTS ------------------
# Timing settings for recording and word detection. The code that reads them
# is not shown here, so the meanings below are inferred from the names only
# -- confirm against the recording loop.
MAX_RECORD_SECONDS = 20  # presumably the cap on one recording, in seconds
NO_WORD_TIMEOUT = 10.5  # presumably seconds without a detected word before giving up
WORD_CHECK_INTERVAL = 1.0  # presumably seconds between word checks
MIN_CHECK_AUDIO_SEC = 1.5  # presumably minimum seconds of audio before a word check runs
Discussion in the ATmosphere