How can I tell the size of a model before downloading it?
Or, depending on which hf_hub version you have:
#!/usr/bin/env python3
from __future__ import annotations

from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError
def human_size(num_bytes: int | None) -> str:
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        num_bytes: Size in bytes. ``None`` and ``0`` are both rendered as
            ``"0 B"``.

    Returns:
        The size with two decimals and a unit from B to TB, for example
        ``"1.50 KB"``. Sizes of 1024 TB or more stay in TB.
    """
    if not num_bytes:
        return "0 B"

    units = ("B", "KB", "MB", "GB", "TB")
    size = float(num_bytes)
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    # Whatever survived every division is reported in the largest unit.
    return f"{size:.2f} {units[-1]}"
def prompt_int(label: str, default: int) -> int:
    """Ask for a positive integer on stdin, falling back to ``default``.

    Args:
        label: Text shown before the ``[default]`` hint.
        default: Value returned when the answer is empty, is not an
            integer, or is not greater than zero.

    Returns:
        The parsed positive integer, or ``default``.
    """
    value = input(f"{label} [{default}]: ").strip()
    if not value:
        return default

    # Only the conversion can raise, so keep the try body to that one call.
    try:
        parsed = int(value)
    except ValueError:
        print(f"Invalid number. Using default: {default}")
        return default

    if parsed <= 0:
        print(f"Using default: {default}")
        return default
    return parsed
def prompt_bool(label: str, default: bool = False) -> bool:
    """Ask a yes/no question on stdin.

    Args:
        label: Question text shown before the ``[y/n, default ...]`` hint.
        default: Value returned when the answer is empty or not recognised.

    Returns:
        ``True`` for ``y``/``yes``/``true``/``1``, ``False`` for
        ``n``/``no``/``false``/``0`` (case-insensitive), otherwise
        ``default``.
    """
    default_text = "y" if default else "n"
    value = input(f"{label} [y/n, default {default_text}]: ").strip().lower()
    if not value:
        return default
    if value in {"y", "yes", "true", "1"}:
        return True
    if value in {"n", "no", "false", "0"}:
        return False
    # A typo must not silently turn a default of "yes" into "no"; fall back
    # to the default the same way the other prompt helpers do.
    print(f"Invalid choice. Using default: {default_text}")
    return default
def prompt_choice(label: str, choices: list[str], default: str) -> str:
    """Print a numbered menu and ask the user to pick one entry.

    Args:
        label: Heading printed above the menu.
        choices: Allowed values, listed in this order and numbered from 1.
        default: Value returned when the answer is empty or invalid.

    Returns:
        The chosen entry. The answer may be the entry's number, its exact
        name, or its name in a different letter case (when that identifies
        exactly one entry). Anything else yields ``default``.
    """
    print(f"{label}:")
    for index, choice in enumerate(choices, start=1):
        marker = " default" if choice == default else ""
        print(f" {index}. {choice}{marker}")

    value = input(f"Choose 1-{len(choices)} [{default}]: ").strip()
    if not value:
        return default
    if value in choices:
        return value

    try:
        index = int(value)
    except ValueError:
        # Not a number: tolerate a name typed with different letter case,
        # but only when it is unambiguous.
        folded = value.casefold()
        matches = [choice for choice in choices if choice.casefold() == folded]
        if len(matches) == 1:
            return matches[0]
    else:
        if 1 <= index <= len(choices):
            return choices[index - 1]

    print(f"Invalid choice. Using default: {default}")
    return default
def get_repo_size(api: HfApi, repo_id: str) -> tuple[int, list[tuple[str, int]]]:
    """Compute the total size of a model repository from its file listing.

    Calls ``api.model_info(repo_id, files_metadata=True)`` and reads the
    ``size`` of every entry in the returned ``siblings``.

    Args:
        api: Client used to query the Hub.
        repo_id: Repository identifier passed to ``model_info``.

    Returns:
        ``(total_bytes, files)`` where ``files`` is a list of
        ``(filename, size_in_bytes)`` pairs sorted largest first. Entries
        without a size count as 0 bytes; entries without a name are listed
        as ``"<unknown>"``.

    Raises:
        Whatever ``api.model_info`` raises; nothing is caught here.
    """
    info = api.model_info(repo_id, files_metadata=True)

    files: list[tuple[str, int]] = []
    for sibling in info.siblings or []:
        # The file name attribute is looked up under both spellings.
        name = (
            getattr(sibling, "rfilename", None)
            or getattr(sibling, "filename", None)
            or "<unknown>"
        )
        files.append((name, int(getattr(sibling, "size", None) or 0)))

    files.sort(key=lambda item: item[1], reverse=True)
    total = sum(size for _, size in files)
    return total, files
def get_model_id(model) -> str | None:
    """Return the repository id stored on a model listing entry.

    Reads ``model.modelId`` first and falls back to ``model.id``; which of
    the two is populated presumably depends on the installed
    huggingface_hub version (TODO confirm).

    Returns:
        The first truthy of the two attributes, or a falsy value (normally
        ``None``) when neither is set.
    """
    return getattr(model, "modelId", None) or getattr(model, "id", None)
def sort_models_locally(models: list, sort: str, ascending: bool) -> list:
    """Sort ``models`` in place by one of the supported fields.

    Args:
        models: Model listing objects; the list is reordered in place.
        sort: One of ``"downloads"``, ``"likes"``, ``"lastModified"``,
            ``"createdAt"`` or ``"trendingScore"``. Any other value leaves
            the order untouched.
        ascending: ``True`` for smallest/oldest first, ``False`` for
            largest/newest first.

    Returns:
        The same list object that was passed in.
    """
    # Attribute names to try for each sort key. The snake_case spellings are
    # tried as well because listing objects apparently expose them instead
    # of the camelCase ones on some huggingface_hub versions (TODO confirm);
    # reading only camelCase would make those sorts a silent no-op there.
    numeric_fields = {
        "downloads": ("downloads",),
        "likes": ("likes",),
        "trendingScore": ("trendingScore", "trending_score"),
    }
    text_fields = {
        "lastModified": ("lastModified", "last_modified"),
        "createdAt": ("createdAt", "created_at"),
    }

    if sort in numeric_fields:
        attributes, as_text = numeric_fields[sort], False
    elif sort in text_fields:
        attributes, as_text = text_fields[sort], True
    else:
        return models

    def sort_key(model):
        value = None
        for attribute in attributes:
            value = getattr(model, attribute, None)
            if value:
                break
        if as_text:
            # Compare as strings so missing values ("") and timestamp
            # objects can coexist in one sort.
            return str(value or "")
        return value or 0

    models.sort(key=sort_key, reverse=not ascending)
    return models
def _search_models(api: HfApi, query: str, limit: int, sort: str) -> list:
    """Return the search hits for ``query`` as a list (at most ``limit`` are requested)."""
    try:
        return list(api.list_models(search=query, limit=limit, sort=sort, full=True))
    except TypeError:
        # Retry without ``sort``; presumably a fallback for huggingface_hub
        # versions whose list_models() rejects that argument (TODO confirm).
        return list(api.list_models(search=query, limit=limit, full=True))


def _collect_rows(api: HfApi, models: list) -> list[dict]:
    """Look up every model's repository size and build one report row per model."""
    rows = []
    total_models = len(models)
    for index, model in enumerate(models, start=1):
        repo_id = get_model_id(model)
        if not repo_id:
            continue

        print(f"[{index}/{total_models}] Checking {repo_id}...")
        total_size, files, error = None, [], None
        try:
            total_size, files = get_repo_size(api, repo_id)
        except HfHubHTTPError as e:
            error = str(e)
        except Exception as e:
            # Best effort: one failing repository must not abort the report.
            error = f"{type(e).__name__}: {e}"

        rows.append({
            "repo_id": repo_id,
            "size": total_size,
            "downloads": getattr(model, "downloads", None),
            "likes": getattr(model, "likes", None),
            "pipeline": getattr(model, "pipeline_tag", None),
            "error": error,
            "files": files,
        })
    return rows


def _print_report(query: str, rows: list[dict], show_files: bool, top_files: int) -> None:
    """Print the result table, plus each model's largest files when requested."""
    print()
    print(f"Search: {query}")
    print()
    print(f"{'SIZE':>12} {'DOWNLOADS':>10} {'LIKES':>7} {'TYPE':<24} MODEL")
    print("-" * 110)
    for row in rows:
        # A failed lookup has no size; show "-" rather than a misleading "0 B".
        size = human_size(row["size"]) if row["size"] is not None else "-"
        downloads = row["downloads"] if row["downloads"] is not None else "-"
        likes = row["likes"] if row["likes"] is not None else "-"
        pipeline = row["pipeline"] or "-"
        print(f"{size:>12} {downloads:>10} {likes:>7} {pipeline:<24} {row['repo_id']}")
        if row["error"]:
            print(f"{'':>12} error: {row['error']}")
        if show_files and row["files"]:
            for filename, file_size in row["files"][:top_files]:
                print(f"{'':>12} {human_size(file_size):>10} {filename}")
            print()


def main() -> int:
    """Interactively search the Hugging Face Hub and report model repo sizes.

    Prompts for a search term and display options, lists matching models,
    queries the size of each repository, and prints a table.

    Returns:
        Process exit status: 0 on success; 1 when no search term was
        entered, the search request failed, or no models matched.
    """
    print()
    print("Hugging Face model size search")
    print()

    query = input("Search term, partial match is OK: ").strip()
    if not query:
        print("No search term entered. Exiting.")
        return 1

    limit = prompt_int("How many model results should I check?", 20)
    sort = prompt_choice(
        "Sort Hugging Face search results by",
        ["downloads", "likes", "lastModified", "createdAt", "trendingScore"],
        "downloads",
    )
    ascending = prompt_bool("Ascending sort?", False)
    show_files = prompt_bool("Show largest files for each model?", True)
    top_files = 5
    if show_files:
        top_files = prompt_int("How many largest files per model?", 5)
    size_sort = prompt_choice(
        "Sort final output by computed repo size",
        ["desc", "asc", "none"],
        "desc",
    )

    api = HfApi()
    print()
    print(f"Searching Hugging Face for: {query}")
    print()

    try:
        models = _search_models(api, query, limit, sort)
    except HfHubHTTPError as e:
        # Report a failed search request instead of dumping a traceback.
        print(f"Search failed: {e}")
        return 1

    # NOTE(review): this only reorders the results already returned; it
    # cannot change which models the Hub selected for the first `limit` hits.
    models = sort_models_locally(models, sort=sort, ascending=ascending)
    if not models:
        print(f"No models found for search term: {query}")
        return 1

    rows = _collect_rows(api, models)

    # Rows whose size lookup failed sort as if they were 0 bytes.
    if size_sort == "asc":
        rows.sort(key=lambda r: r["size"] or 0)
    elif size_sort == "desc":
        rows.sort(key=lambda r: r["size"] or 0, reverse=True)

    _print_report(query, rows, show_files, top_files)
    return 0
# Run the interactive search only when executed as a script; the value
# returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Discussion in the ATmosphere