LargeWord is the largest model in the WordGen family, with about 1.59M parameters. It generates plausible-looking or real words learned from its pretraining dataset. Key configuration values:

| Parameter | Value |
|---|---|
| hidden_size | 160 |
| num_hidden_layers | 4 |
| num_attention_heads | 2 |
| num_key_value_heads | 2 |
| intermediate_size | 512 |
| max_position_embeddings | 77 |
| rope_theta | 10000.0 |
| tie_word_embeddings | True |
| vocab_size | 1204 |
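For reference, the table above can be expressed as a transformers config. This is a minimal sketch, assuming the Qwen3 architecture (the inference script below also treats the model as Qwen3) and a head dimension of hidden_size / num_attention_heads = 80, which the table does not list; it requires a recent transformers release.

```python
from transformers import Qwen3Config, Qwen3ForCausalLM

# Sketch of the config table above. head_dim is assumed, not taken from the table.
config = Qwen3Config(
    hidden_size=160,
    num_hidden_layers=4,
    num_attention_heads=2,
    num_key_value_heads=2,
    intermediate_size=512,
    max_position_embeddings=77,
    rope_theta=10000.0,
    tie_word_embeddings=True,
    vocab_size=1204,
    head_dim=80,  # assumed: hidden_size // num_attention_heads
)

model = Qwen3ForCausalLM(config)  # randomly initialised, not the trained weights
print(f"{sum(p.numel() for p in model.parameters()):,}")  # roughly the 1.59M quoted above
```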
LargeWord was trained on 753,232 words (4,153,110 tokens) for 2 epochs with a batch size of 8 on an NVIDIA RTX 2060 (6 GB). Training and evaluation metrics, logged every 500 steps:

| Step | Epoch | Train Loss | Train PPL | Eval Loss | Eval PPL |
|---|---|---|---|---|---|
| 500 | 0.30 | 4.3276 | 75.74 | 2.4190 | 11.23 |
| 1000 | 0.61 | 1.7151 | 5.56 | 1.4076 | 4.09 |
| 1500 | 0.91 | 1.3247 | 3.76 | 1.2682 | 3.55 |
| 2000 | 1.21 | 1.2120 | 3.36 | 1.2026 | 3.33 |
| 2500 | 1.51 | 1.1619 | 3.20 | 1.1667 | 3.21 |
| 3000 | 1.82 | 1.1314 | 3.10 | 1.1378 | 3.12 |
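The perplexity columns are simply the exponential of the corresponding loss, which is a quick way to sanity-check the table; for example, at step 3000:

```python
import math

print(f"{math.exp(1.1314):.2f}")  # 3.10, train PPL at step 3000
print(f"{math.exp(1.1378):.2f}")  # 3.12, eval PPL at step 3000
```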
Sample generations:

| Prompt | Output |
|---|---|
| `w` | `weldosfish's` |
| `app` | `appardness` |
| `z` | `zeething's` |
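The following script loads the model with transformers, rebuilds the tokenizer setup used during training, and samples a continuation for a single prompt. Adjust the settings at the top to experiment with other prompts or sampling parameters.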
```python
# =============================================================================
# Inference
# =============================================================================
MODEL_DIR = "Harley-ml/LargeWord-1.5M"   # Hub repo id or local checkpoint directory
TOKENIZER_PATH = MODEL_DIR

# --- Generation settings ---
PROMPT = "a"
MAX_NEW_TOKENS = 16
TEMPERATURE = 1.2
TOP_P = 0.95
TOP_K = 200
REPETITION_PENALTY = 1.1
DO_SAMPLE = True
# =============================================================================

import torch
from pathlib import Path
from huggingface_hub import hf_hub_download
from transformers import (
    AutoModelForCausalLM,
    PreTrainedTokenizerFast,
    AddedToken,
)

# ---------------------------------------------------------------------------
# Device
# ---------------------------------------------------------------------------
device = (
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)
print(f"Device : {device}")
# ---------------------------------------------------------------------------
# Tokenizer (mirrors training setup)
# ---------------------------------------------------------------------------
def load_tokenizer(path: str):
    p = Path(path)
    if p.is_dir():                      # local checkpoint directory
        p = p / "tokenizer.json"
    if not p.is_file():                 # otherwise treat `path` as a Hub repo id
        p = Path(hf_hub_download(repo_id=path, filename="tokenizer.json"))
    tok = PreTrainedTokenizerFast(tokenizer_file=str(p))

    # Re-register the special tokens used during training if they are missing.
    specials = {}
    if tok.bos_token is None: specials["bos_token"] = AddedToken("<|bos|>", special=True)
    if tok.eos_token is None: specials["eos_token"] = AddedToken("<|eos|>", special=True)
    if tok.unk_token is None: specials["unk_token"] = AddedToken("<|unk|>", special=True)
    if tok.pad_token is None:
        if tok.eos_token is not None:
            tok.pad_token = tok.eos_token
        else:
            specials["pad_token"] = AddedToken("<|pad|>", special=True)
    if specials:
        tok.add_special_tokens(specials)

    tok.padding_side = "left"  # left-pad for batched generation
    return tok

print("Loading tokenizer...")
tokenizer = load_tokenizer(TOKENIZER_PATH)
print(f" Vocab size : {tokenizer.vocab_size}")
print(f" BOS : {tokenizer.bos_token!r}")
print(f" EOS : {tokenizer.eos_token!r}")
print(f" PAD : {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})")
# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
print(f"\nLoading model from {MODEL_DIR} ...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    dtype=torch.float16 if device == "cuda" else torch.float32,
    low_cpu_mem_usage=True,
)
model.eval()
model.to(device)

total_params = sum(p.numel() for p in model.parameters())
print(f" Parameters : {total_params:,}")
# ---------------------------------------------------------------------------
# Generation helper
# ---------------------------------------------------------------------------
def generate(
    prompt: str = PROMPT,
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = TEMPERATURE,
    top_p: float = TOP_P,
    top_k: int = TOP_K,
    repetition_penalty: float = REPETITION_PENALTY,
    do_sample: bool = DO_SAMPLE,
) -> str:
    bos = tokenizer.bos_token or ""
    full_prompt = bos + prompt
    inputs = tokenizer(
        full_prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(device)
    inputs.pop("token_type_ids", None)  # Qwen3 doesn't use this

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    if do_sample:
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p
        gen_kwargs["top_k"] = top_k

    with torch.inference_mode():
        output_ids = model.generate(**inputs, **gen_kwargs)

    # Strip the prompt tokens so we only return what was generated
    prompt_len = inputs["input_ids"].shape[-1]
    new_ids = output_ids[0][prompt_len:]
    return tokenizer.decode(new_ids, skip_special_tokens=True)
# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    print(f"\nPrompt : {PROMPT!r}")
    print("-" * 60)
    completion = generate(PROMPT)  # continuation only; the prompt is stripped
    print("Generated:")
    print(PROMPT + completion)     # prepend the prompt to show the full word
```
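Because `load_tokenizer` sets `padding_side = "left"`, the same objects also support batched generation. A minimal sketch reusing the `tokenizer`, `model`, and settings defined above (the prompt list is arbitrary):

```python
# Batched sampling: left padding keeps every prompt flush against the position
# where generation starts, so each word is continued correctly.
prompts = ["a", "qu", "str"]  # example prompts (arbitrary)
bos = tokenizer.bos_token or ""
batch = tokenizer(
    [bos + p for p in prompts],
    return_tensors="pt",
    padding=True,
    add_special_tokens=False,
).to(device)
batch.pop("token_type_ids", None)

with torch.inference_mode():
    out = model.generate(
        **batch,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

prompt_len = batch["input_ids"].shape[-1]
for p, ids in zip(prompts, out):
    print(p + tokenizer.decode(ids[prompt_len:], skip_special_tokens=True))
```

Left padding is what makes the prompt-stripping step (`ids[prompt_len:]`) valid for every row of the batch, which is why the script configures it even though the single-prompt path never pads.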