Continuous increase in memory usage
Hugging Face Forums [Unofficial]
April 13, 2026
I have the same issue with a Wav2vec2 model deployed on my local GPU system: the continuous growth of RAM eventually causes the system to crash. The model is loaded using the Hugging Face pipeline. Below is the code that I used:
@router.post("/transcribe")
async def quran(audio_file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and report the RAM delta of the request.

    The upload is decoded with torchaudio, down-mixed to mono, resampled to
    16 kHz, flattened to a 1-D tensor and handed to ``ai_models["wav2vec"]``.
    The ``"text"`` entry of the model output (empty string if absent) is
    returned as ``{"transcript": ...}``.

    Args:
        audio_file: The uploaded audio file.

    Returns:
        A ``JSONResponse`` with status 200 and the transcript.

    Raises:
        HTTPException: 500 with a generic message if any step of decoding,
            pre-processing or inference fails; the original error is logged
            with its traceback.
    """
    process = psutil.Process()
    start_ram = process.memory_info().rss / (1024**2)

    # Pre-bind every large local so the cleanup in `finally` can never hit an
    # unbound name (which would raise and mask the real error).
    audio_bytes = None
    waveform = None
    input_tensor = None
    transcript_raw = None

    try:
        audio_bytes = await audio_file.read()

        # Load with torchaudio; waveform shape is [channels, time].
        with io.BytesIO(audio_bytes) as audio_buffer:
            waveform, sample_rate = torchaudio.load(audio_buffer)

        # Convert to mono if the clip has more than one channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to 16 kHz (standard for most speech models).
        target_sample_rate = 16000
        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=target_sample_rate
            )
            waveform = resampler(waveform)

        # Drop only the channel axis: [1, samples] -> [samples]. A bare
        # squeeze() would also collapse a one-sample clip to a 0-D tensor.
        input_tensor = waveform.squeeze(0)

        # Inference without autograd bookkeeping to reduce memory usage.
        with torch.no_grad():
            transcript_raw = ai_models["wav2vec"](input_tensor)

        transcript = transcript_raw.get("text", "")
        return JSONResponse(content={"transcript": transcript}, status_code=200)
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Transcription Error: {e}")
        raise HTTPException(
            status_code=500, detail="Internal processing error"
        ) from e
    finally:
        # --- AGGRESSIVE CLEANUP ---
        # 1. Drop every reference to request-sized buffers, including the
        #    flattened tensor and the raw model output.
        del audio_bytes, waveform, input_tensor, transcript_raw

        # 2. Collect Python garbage first so that any tensors it frees can
        #    then be released from the CUDA caching allocator.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # 3. Ask the C allocator to hand free heap pages back to the OS
        #    (malloc_trim); skipped when `libc` is falsy.
        if libc:
            libc.malloc_trim(0)

        final_ram = process.memory_info().rss / (1024**2)
        logger.info(f"RAM Status: {start_ram:.1f}MB -> {final_ram:.1f}MB")
Can anyone help?
Discussion in the ATmosphere