External Publication
Visit Post

Streamer AI (Like Neuro-Sama)

Hugging Face Forums [Unofficial] April 11, 2026
Source

Here's my personal Whisper setup at the moment (because Whisper sucks). Version 1:

# Whisper model selection.
# Options: "tiny.en", "base.en", "small.en", "medium.en", "large", "large-v3"
# Larger models are nominally more accurate but slower. In the author's own
# testing, however, "large" and "large-v3" performed worse and are NOT
# recommended here.
WHISPER_MODEL = "small.en"  # "base.en" and "small.en" were the best-quality models in testing

# Loaded once at import time and shared by safe_transcribe().
stt = whisper.load_model(WHISPER_MODEL)

def safe_transcribe(
    audio_np,
    compression_ratio_threshold: float = 2.4,
    logprob_threshold: float = -1.0,
    no_speech_threshold: float = 0.6,
) -> str:
    """Transcribe ``audio_np`` with the module-level ``stt`` model without raising.

    The audio is rejected up front (returning ``""``) when it is ``None``,
    shorter than 16000 samples, or its peak absolute amplitude is below 0.01.
    Otherwise it is passed to ``stt.transcribe`` and the resulting text is
    returned with the ``initial_prompt`` sentence removed if it leaked into
    the output.

    Args:
        audio_np: NumPy array of audio samples (the length guard assumes
            16 kHz audio, i.e. 16000 samples == 1 second).
        compression_ratio_threshold: Forwarded unchanged to ``stt.transcribe``.
        logprob_threshold: Forwarded unchanged to ``stt.transcribe``.
        no_speech_threshold: Forwarded unchanged to ``stt.transcribe``.

    Returns:
        The stripped transcription, or ``""`` when the input is rejected, the
        result has an unexpected type, or transcription raises.
    """
    # Single definition so the prompt passed to Whisper and the text filtered
    # out of its answer can never drift apart.
    prompt_text = "This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times."

    if audio_np is None:
        console.print("[yellow]⚠️ safe_transcribe: audio_np is None[/yellow]")
        return ""

    if audio_np.size < 16000:  # < 1 second @ 16kHz
        console.print(f"[yellow]⚠️ safe_transcribe: audio too short ({audio_np.size} < 16000)[/yellow]")
        return ""

    peak = np.max(np.abs(audio_np))
    if peak < 0.01:
        console.print(f"[yellow]⚠️ safe_transcribe: audio too quiet ({peak:.4f} < 0.01)[/yellow]")
        return ""

    try:
        console.print(f"[cyan]🔍 Whisper transcribing {len(audio_np)} samples...[/cyan]")

        result = stt.transcribe(
            audio_np,
            fp16=False,
            language="en",  # Force English for better accuracy
            temperature=0.0,
            compression_ratio_threshold=compression_ratio_threshold,
            logprob_threshold=logprob_threshold,
            no_speech_threshold=no_speech_threshold,
            initial_prompt=prompt_text,
            condition_on_previous_text=True,  # Use previous context for better transcription
            word_timestamps=False,  # Disable for faster processing
        )

        console.print(f"[cyan]🔍 Whisper result type: {type(result)}[/cyan]")

        if isinstance(result, dict):
            text = result.get("text", "").strip()
            console.print(f"[cyan]🔍 Whisper dict result: '{text}'[/cyan]")
        elif isinstance(result, str):
            console.print(f"[cyan]🔍 Whisper string result: '{result}'[/cyan]")
            text = result.strip()
        else:
            console.print(f"[yellow]⚠️ Whisper unexpected result type: {type(result)}[/yellow]")
            return ""

        # Filter out initial_prompt leakage. This now runs for both result
        # shapes; previously the string branch referenced an unassigned
        # variable and then discarded the filtered value.
        if prompt_text in text:
            text = text.replace(prompt_text, "").strip()
            console.print(f"[green]🧹 Filtered prompt from transcription: '{text}'[/green]")

        # Log the mean per-segment avg_logprob as a rough quality indicator.
        segments = result.get("segments") if isinstance(result, dict) else None
        if segments:
            avg_logprob = sum(s.get("avg_logprob", 0) for s in segments) / len(segments)
            console.print(f"[cyan]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}[/cyan]")

        return text

    except Exception as e:
        # Deliberate best-effort boundary: callers always get a string back.
        console.print(f"[red]❌ Whisper transcription error: {e}[/red]")
        import traceback

        traceback.print_exc()
        return ""

Version 2 (older):

# Whisper model selection.
# Options: "tiny.en", "base.en", "small.en", "medium.en", "large", "large-v3"
# Larger models are more accurate but slower.
# Try "large-v3" for best quality, or "large" if v3 is not available.
WHISPER_MODEL = "base.en"  # Best quality model

# Loaded once at import time and shared by safe_transcribe().
stt = whisper.load_model(WHISPER_MODEL)

def safe_transcribe(
    audio_np,
    temperature: float = 0.0,
    compression_ratio_threshold: float = 2.4,
    logprob_threshold: float = -1.0,
    no_speech_threshold: float = 0.6,
) -> str:
    """Transcribe ``audio_np`` with the module-level ``stt`` model without raising.

    The audio is rejected up front (returning ``""``) when it is ``None``,
    shorter than 16000 samples, or its peak absolute amplitude is below 0.01.
    Otherwise it is passed to ``stt.transcribe`` and the stripped text of the
    result is returned.

    Args:
        audio_np: NumPy array of audio samples (the length guard assumes
            16 kHz audio, i.e. 16000 samples == 1 second).
        temperature: Forwarded unchanged to ``stt.transcribe``; the default
            of 0.0 matches the value hard-coded in the other variants.
        compression_ratio_threshold: Forwarded unchanged to ``stt.transcribe``.
        logprob_threshold: Forwarded unchanged to ``stt.transcribe``.
        no_speech_threshold: Forwarded unchanged to ``stt.transcribe``.

    Returns:
        The stripped transcription, or ``""`` when the input is rejected, the
        result has an unexpected type, or transcription raises.
    """
    if audio_np is None:
        console.print("[yellow]⚠️ safe_transcribe: audio_np is None[/yellow]")
        return ""

    if audio_np.size < 16000:  # < 1 second @ 16kHz
        console.print(f"[yellow]⚠️ safe_transcribe: audio too short ({audio_np.size} < 16000)[/yellow]")
        return ""

    peak = np.max(np.abs(audio_np))
    if peak < 0.01:
        console.print(f"[yellow]⚠️ safe_transcribe: audio too quiet ({peak:.4f} < 0.01)[/yellow]")
        return ""

    try:
        console.print(f"[cyan]🔍 Whisper transcribing {len(audio_np)} samples...[/cyan]")

        result = stt.transcribe(
            audio_np,
            fp16=False,
            language="en",  # Force English for better accuracy
            temperature=temperature,
            compression_ratio_threshold=compression_ratio_threshold,
            logprob_threshold=logprob_threshold,
            no_speech_threshold=no_speech_threshold,
            # Optional: the initial_prompt can be removed entirely.
            initial_prompt="This is a clear spoken conversation. Please transcribe accurately.",
            condition_on_previous_text=True,  # Use previous context for better transcription
            word_timestamps=False,  # Disable for faster processing
        )

        console.print(f"[cyan]🔍 Whisper result type: {type(result)}[/cyan]")

        if isinstance(result, dict):
            text = result.get("text", "").strip()
            console.print(f"[cyan]🔍 Whisper dict result: '{text}'[/cyan]")

            # Log the mean per-segment avg_logprob as a rough quality indicator.
            segments = result.get("segments")
            if segments:
                avg_logprob = sum(s.get("avg_logprob", 0) for s in segments) / len(segments)
                console.print(f"[cyan]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}[/cyan]")

            return text

        if isinstance(result, str):
            console.print(f"[cyan]🔍 Whisper string result: '{result}'[/cyan]")
            return result.strip()

        console.print(f"[yellow]⚠️ Whisper unexpected result type: {type(result)}[/yellow]")
        return ""

    except Exception as e:
        # Deliberate best-effort boundary: callers always get a string back.
        console.print(f"[red]❌ Whisper transcription error: {e}[/red]")
        import traceback

        traceback.print_exc()
        return ""

Or another one that I just edited, which seems to be giving good results now that I've tweaked it:

# Whisper model selection.
# Options: "tiny.en", "base.en", "small.en", "medium.en", "large", "large-v3"
# Larger models are more accurate but slower.
# Try "large-v3" for best quality, or "large" if v3 is not available.
WHISPER_MODEL = "small.en"  # Best quality model

# Loaded once at import time and shared by safe_transcribe().
stt = whisper.load_model(WHISPER_MODEL)

def safe_transcribe(
    audio_np,
    compression_ratio_threshold: float = 2.4,
    logprob_threshold: float = -1.0,
    no_speech_threshold: float = 0.6,
) -> str:
    """Transcribe ``audio_np`` with the module-level ``stt`` model without raising.

    The audio is rejected up front (returning ``""``) when it is ``None``,
    shorter than 16000 samples, or its peak absolute amplitude is below 0.02.
    Otherwise it is passed to ``stt.transcribe`` (with no ``initial_prompt``
    and ``condition_on_previous_text=False``) and the stripped text of the
    result is returned.

    Args:
        audio_np: NumPy array of audio samples (the length guard assumes
            16 kHz audio, i.e. 16000 samples == 1 second).
        compression_ratio_threshold: Forwarded unchanged to ``stt.transcribe``.
        logprob_threshold: Forwarded unchanged to ``stt.transcribe``.
        no_speech_threshold: Forwarded unchanged to ``stt.transcribe``.

    Returns:
        The stripped transcription, or ``""`` when the input is rejected, the
        result has an unexpected type, or transcription raises.
    """
    # Raised from the 0.01 used in the earlier variants: a higher floor keeps
    # near-silent audio away from Whisper, which otherwise hallucinates text.
    min_peak = 0.02

    # No initial_prompt is passed below, so this filter is a defensive
    # leftover from the variant that did pass this sentence as a prompt.
    prompt_text = "This is a clear spoken conversation improve transcription to where it sounds like something a human would say do not repeat words multiple times."

    if audio_np is None:
        console.print("[yellow]⚠️ safe_transcribe: audio_np is None[/yellow]")
        return ""

    if audio_np.size < 16000:  # < 1 second @ 16kHz
        console.print(f"[yellow]⚠️ safe_transcribe: audio too short ({audio_np.size} < 16000)[/yellow]")
        return ""

    peak = np.max(np.abs(audio_np))
    if peak < min_peak:
        # The message reports the threshold actually applied (it previously
        # still said 0.01 after the threshold was raised).
        console.print(f"[yellow]⚠️ safe_transcribe: audio too quiet ({peak:.4f} < {min_peak})[/yellow]")
        return ""

    try:
        console.print(f"[cyan]🔍 Whisper transcribing {len(audio_np)} samples...[/cyan]")

        result = stt.transcribe(
            audio_np,
            fp16=False,
            language="en",  # Force English for better accuracy
            temperature=0.0,
            compression_ratio_threshold=compression_ratio_threshold,
            logprob_threshold=logprob_threshold,
            no_speech_threshold=no_speech_threshold,
            # initial_prompt is intentionally omitted in this variant.
            # True made the model repeat the same word many times; keep False.
            condition_on_previous_text=False,
            word_timestamps=False,  # Disable for faster processing
        )

        console.print(f"[cyan]🔍 Whisper result type: {type(result)}[/cyan]")

        if isinstance(result, dict):
            text = result.get("text", "").strip()
            console.print(f"[cyan]🔍 Whisper dict result: '{text}'[/cyan]")

            # Filter out initial_prompt leakage (silent)
            if prompt_text in text:
                text = text.replace(prompt_text, "").strip()

            # Log the mean per-segment avg_logprob as a rough quality indicator.
            segments = result.get("segments")
            if segments:
                avg_logprob = sum(s.get("avg_logprob", 0) for s in segments) / len(segments)
                console.print(f"[cyan]🔍 Transcription quality (avg_logprob): {avg_logprob:.2f}[/cyan]")

            return text

        if isinstance(result, str):
            console.print(f"[cyan]🔍 Whisper string result: '{result}'[/cyan]")

            # Filter out initial_prompt leakage (silent)
            if prompt_text in result:
                result = result.replace(prompt_text, "").strip()

            return result.strip()

        console.print(f"[yellow]⚠️ Whisper unexpected result type: {type(result)}[/yellow]")
        return ""

    except Exception as e:
        # Deliberate best-effort boundary: callers always get a string back.
        console.print(f"[red]❌ Whisper transcription error: {e}[/red]")
        import traceback

        traceback.print_exc()
        return ""

Extra constants that are also needed:

# ------------------ CONSTANTS ------------------
# NOTE(review): the code that consumes these values is not part of this
# snippet. The names suggest durations in seconds for a recording loop;
# confirm units and meaning against the caller.
MAX_RECORD_SECONDS: int = 20
NO_WORD_TIMEOUT: float = 10.5
WORD_CHECK_INTERVAL: float = 1.0
MIN_CHECK_AUDIO_SEC: float = 1.5

Discussion in the ATmosphere

Loading comments...