8 changes: 7 additions & 1 deletion .gitignore
@@ -4,8 +4,14 @@ __pycache__/
modded-nanogpt/
modded-nanogpt
data/datasets
data/*_local_build/
data/manifest.json
data/docs_selected.jsonl
.mypy_cache/
.venv
logs/
logs/
final_model.pt
final_model.int8.ptz
tools/RunMonitor/bin/
tools/RunMonitor/obj/
tinkering/
29 changes: 29 additions & 0 deletions FunProject.sln
@@ -0,0 +1,29 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.5.2.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{07C2787E-EAC7-C090-1BA3-A61EC2A24D84}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RunMonitor", "tools\RunMonitor\RunMonitor.csproj", "{DE6DDB0B-4D95-4B24-381D-64E74C9B1199}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{DE6DDB0B-4D95-4B24-381D-64E74C9B1199}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{DE6DDB0B-4D95-4B24-381D-64E74C9B1199}.Debug|Any CPU.Build.0 = Debug|Any CPU
{DE6DDB0B-4D95-4B24-381D-64E74C9B1199}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DE6DDB0B-4D95-4B24-381D-64E74C9B1199}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{DE6DDB0B-4D95-4B24-381D-64E74C9B1199} = {07C2787E-EAC7-C090-1BA3-A61EC2A24D84}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {96B455A5-4CC6-4905-9123-7FCEC67532C8}
EndGlobalSection
EndGlobal
4 changes: 4 additions & 0 deletions currentspecs.txt
@@ -0,0 +1,4 @@
Baseline after initial easy wins:
val_loss: 3.4190
val_bpb: 2.0916
artifact_bytes: 6831983
377 changes: 377 additions & 0 deletions docs/research_tracks.md

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions run_monitor.bat
@@ -0,0 +1,3 @@
@echo off
setlocal
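REM Forward all arguments to the PowerShell monitor script under scripts\.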
powershell -NoProfile -ExecutionPolicy Bypass -File "%~dp0scripts\run_monitor.ps1" %*
128 changes: 128 additions & 0 deletions scripts/build_sp1024_local_subset.py
@@ -0,0 +1,128 @@
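"""Export a local SP-1024 subset dataset from an existing selected-doc prefix.

Illustrative invocation (the JSONL path is an assumption; point it at your local copy):

    python scripts/build_sp1024_local_subset.py --docs-jsonl data/docs_selected.jsonl
"""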
from __future__ import annotations

import argparse
import importlib.util
import json
import shutil
import sys
from pathlib import Path


def load_export_module(repo_root: Path):
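    """Dynamically import data/download_hf_docs_and_tokenize.py so its tokenizer and shard-export helpers can be reused."""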
module_path = repo_root / "data" / "download_hf_docs_and_tokenize.py"
spec = importlib.util.spec_from_file_location("local_export_module_sp1024", module_path)
if spec is None or spec.loader is None:
raise RuntimeError(f"failed to load export module from {module_path}")
module = importlib.util.module_from_spec(spec)
sys.modules[spec.name] = module
spec.loader.exec_module(module)
return module


def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="Export a local SP-1024 subset dataset from an existing selected-doc prefix")
parser.add_argument("--docs-jsonl", required=True)
parser.add_argument("--num-docs", type=int, default=120000)
parser.add_argument("--num-val-docs", type=int, default=50000)
parser.add_argument("--chunk-tokens", type=int, default=20000000)
parser.add_argument("--dataset-name", default="fineweb10B_sp1024_local120k")
parser.add_argument("--output-root", default="")
return parser


def main() -> None:
args = build_parser().parse_args()
if args.num_docs <= args.num_val_docs:
raise ValueError("--num-docs must be larger than --num-val-docs")

repo_root = Path(__file__).resolve().parents[1]
docs_jsonl = Path(args.docs_jsonl).expanduser().resolve()
if not docs_jsonl.is_file():
raise FileNotFoundError(docs_jsonl)

output_root = (
Path(args.output_root).expanduser().resolve()
if args.output_root
else (repo_root / "data" / "sp1024_local_build").resolve()
)
output_root.mkdir(parents=True, exist_ok=True)
tokenizers_dir = output_root / "tokenizers"
datasets_dir = output_root / "datasets"
tokenizers_dir.mkdir(parents=True, exist_ok=True)
datasets_dir.mkdir(parents=True, exist_ok=True)

export_module = load_export_module(repo_root)
source_tokenizer = repo_root / "data" / "tokenizers" / "fineweb_1024_bpe.model"
if not source_tokenizer.is_file():
raise FileNotFoundError(source_tokenizer)

spec = {
"name": "sp_bpe_1024_local",
"dataset_suffix": "sp1024_local120k",
"vocab_size": 1024,
"model_prefix": "fineweb_1024_bpe_local",
"reuse_model_path": str(source_tokenizer),
}
tok = export_module.build_sentencepiece_tokenizer(
spec=spec,
docs_jsonl=docs_jsonl,
tokenizers_dir=tokenizers_dir,
)

output_dir = datasets_dir / args.dataset_name
stats = export_module.export_shards(
docs_jsonl,
tok,
output_dir,
num_val_docs=int(args.num_val_docs),
shard_size=int(args.chunk_tokens),
docs_total=int(args.num_docs),
)

manifest = {
"version": "local_subset",
"num_docs": int(args.num_docs),
"num_val_docs": int(args.num_val_docs),
"docs_jsonl": str(docs_jsonl),
"tokenizers": [
{
"name": tok["name"],
"kind": tok["kind"],
"vocab_size": int(tok["vocab_size"]),
"bos_id": int(tok["bos_id"]),
"eos_id": int(tok["eos_id"]),
"recommended_bigram_vocab_size": int(((int(tok["vocab_size"]) + 127) // 128) * 128 * 5),
"source_spec": spec,
**tok["manifest"],
}
],
"datasets": [
{
"name": args.dataset_name,
"tokenizer_name": tok["name"],
"tokenizer_kind": tok["kind"],
"path": str(output_dir),
"train_glob": str(output_dir / "fineweb_train_*.bin"),
"val_glob": str(output_dir / "fineweb_val_*.bin"),
"vocab_size": int(tok["vocab_size"]),
"bos_id": int(tok["bos_id"]),
"eos_id": int(tok["eos_id"]),
"recommended_bigram_vocab_size": int(((int(tok["vocab_size"]) + 127) // 128) * 128 * 5),
"stats": stats,
}
],
}
manifest = export_module.relativize_manifest_paths(manifest, output_root)
(output_root / "manifest.json").write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")

target_dataset_dir = repo_root / "data" / "datasets" / args.dataset_name
if target_dataset_dir.exists():
shutil.rmtree(target_dataset_dir)
shutil.copytree(output_dir, target_dataset_dir)

print(f"dataset_dir:{target_dataset_dir}", flush=True)
print(f"dataset_stats:{json.dumps(stats, sort_keys=True)}", flush=True)


if __name__ == "__main__":
main()
188 changes: 188 additions & 0 deletions scripts/build_sp4096_local_subset.py
@@ -0,0 +1,188 @@
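"""Build a lightweight local SP-4096 subset export from the published selected-doc stream.

Illustrative invocation (the output path is an assumption):

    python scripts/build_sp4096_local_subset.py --output-root data/sp4096_local_build
"""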
from __future__ import annotations

import argparse
import importlib.util
import json
import math
import shutil
import sys
import urllib.request
from pathlib import Path


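# Published selected-doc stream and its sidecar manifest on the Hugging Face Hub.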
HF_REPO_ID = "willdepueoai/parameter-golf"
HF_ROOT = "datasets"
DOCS_URL = f"https://huggingface.co/datasets/{HF_REPO_ID}/resolve/main/{HF_ROOT}/docs_selected.jsonl"
SIDECAR_URL = f"https://huggingface.co/datasets/{HF_REPO_ID}/resolve/main/{HF_ROOT}/docs_selected.source_manifest.json"


def load_export_module(repo_root: Path):
module_path = repo_root / "data" / "download_hf_docs_and_tokenize.py"
spec = importlib.util.spec_from_file_location("local_export_module", module_path)
if spec is None or spec.loader is None:
raise RuntimeError(f"failed to load export module from {module_path}")
module = importlib.util.module_from_spec(spec)
sys.modules[spec.name] = module
spec.loader.exec_module(module)
return module


def download_json(url: str) -> dict:
with urllib.request.urlopen(url, timeout=60) as response:
return json.loads(response.read().decode("utf-8"))


def stream_doc_prefix(
*,
docs_url: str,
tokenizer_docs_path: Path,
export_docs_path: Path,
tokenizer_train_docs: int,
export_docs: int,
) -> None:
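    """Stream the docs JSONL once, writing its first `tokenizer_train_docs` lines and first `export_docs` lines to separate prefix files."""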
max_docs = max(tokenizer_train_docs, export_docs)
tokenizer_docs_path.parent.mkdir(parents=True, exist_ok=True)
export_docs_path.parent.mkdir(parents=True, exist_ok=True)
with urllib.request.urlopen(docs_url, timeout=60) as response, tokenizer_docs_path.open("w", encoding="utf-8") as tok_out, export_docs_path.open("w", encoding="utf-8") as exp_out:
for idx, raw_line in enumerate(response, start=1):
line = raw_line.decode("utf-8")
if idx <= tokenizer_train_docs:
tok_out.write(line)
if idx <= export_docs:
exp_out.write(line)
if idx % 50000 == 0:
print(f"downloaded_docs:{idx}", flush=True)
if idx >= max_docs:
break


def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="Build a lightweight local SP-4096 subset export from the published selected-doc stream")
parser.add_argument("--output-root", required=True)
parser.add_argument("--tokenizer-train-docs", type=int, default=500000)
parser.add_argument("--export-docs", type=int, default=120000)
parser.add_argument("--num-val-docs", type=int, default=50000)
parser.add_argument("--chunk-tokens", type=int, default=20000000)
return parser


def main() -> None:
args = build_parser().parse_args()
if args.export_docs <= args.num_val_docs:
raise ValueError("--export-docs must be larger than --num-val-docs")

repo_root = Path(__file__).resolve().parents[1]
output_root = Path(args.output_root).expanduser().resolve()
output_root.mkdir(parents=True, exist_ok=True)
tokenizers_dir = output_root / "tokenizers"
datasets_dir = output_root / "datasets"
tokenizers_dir.mkdir(parents=True, exist_ok=True)
datasets_dir.mkdir(parents=True, exist_ok=True)

tokenizer_docs_jsonl = output_root / "docs_selected_tokenizer_train.jsonl"
export_docs_jsonl = output_root / "docs_selected.jsonl"
export_sidecar = output_root / "docs_selected.source_manifest.json"

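    # Reuse previously downloaded prefix files when both are present; otherwise stream them from the published JSONL.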
if not tokenizer_docs_jsonl.is_file() or not export_docs_jsonl.is_file():
stream_doc_prefix(
docs_url=DOCS_URL,
tokenizer_docs_path=tokenizer_docs_jsonl,
export_docs_path=export_docs_jsonl,
tokenizer_train_docs=args.tokenizer_train_docs,
export_docs=args.export_docs,
)

source_sidecar = download_json(SIDECAR_URL)
subset_sidecar = {
"source_repo_id": HF_REPO_ID,
"source_remote_root": HF_ROOT,
"source_num_docs": source_sidecar.get("num_docs"),
"source_docs_val": source_sidecar.get("docs_val"),
"num_docs": int(args.export_docs),
"docs_val": int(args.num_val_docs),
"docs_sha256": None,
"subset_kind": "prefix",
"tokenizer_train_docs": int(args.tokenizer_train_docs),
}
export_sidecar.write_text(json.dumps(subset_sidecar, indent=2) + "\n", encoding="utf-8")

export_module = load_export_module(repo_root)
spec = {
"name": "sp_bpe_4096",
"dataset_suffix": "sp4096_local",
"vocab_size": 4096,
"model_prefix": "fineweb_4096_bpe",
"tokenizer_train_docs": int(args.tokenizer_train_docs),
}
tok = export_module.build_sentencepiece_tokenizer(
spec=spec,
docs_jsonl=tokenizer_docs_jsonl,
tokenizers_dir=tokenizers_dir,
)
dataset_name = "fineweb10B_sp4096_local"
output_dir = datasets_dir / dataset_name
stats = export_module.export_shards(
export_docs_jsonl,
tok,
output_dir,
num_val_docs=int(args.num_val_docs),
shard_size=int(args.chunk_tokens),
docs_total=int(args.export_docs),
)

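    # Round the vocab size up to the next multiple of 128, then multiply by 5 for the recommended bigram vocab.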
recommended_bigram_vocab_size = int(((int(tok["vocab_size"]) + 127) // 128) * 128 * 5)
manifest = {
"version": "local_subset",
"num_docs": int(args.export_docs),
"num_val_docs": int(args.num_val_docs),
"tokenizer_train_docs": int(args.tokenizer_train_docs),
"shard_size": int(args.chunk_tokens),
"docs_jsonl": str(export_docs_jsonl),
"tokenizers": [
{
"name": tok["name"],
"kind": tok["kind"],
"vocab_size": int(tok["vocab_size"]),
"bos_id": int(tok["bos_id"]),
"eos_id": int(tok["eos_id"]),
"recommended_bigram_vocab_size": recommended_bigram_vocab_size,
"source_spec": spec,
**tok["manifest"],
}
],
"datasets": [
{
"name": dataset_name,
"tokenizer_name": tok["name"],
"tokenizer_kind": tok["kind"],
"path": str(output_dir),
"train_glob": str(output_dir / "fineweb_train_*.bin"),
"val_glob": str(output_dir / "fineweb_val_*.bin"),
"vocab_size": int(tok["vocab_size"]),
"bos_id": int(tok["bos_id"]),
"eos_id": int(tok["eos_id"]),
"recommended_bigram_vocab_size": recommended_bigram_vocab_size,
"stats": stats,
}
],
}
manifest = export_module.relativize_manifest_paths(manifest, output_root)
(output_root / "manifest.json").write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")

target_tokenizer_dir = repo_root / "data" / "tokenizers"
target_dataset_dir = repo_root / "data" / "datasets" / dataset_name
target_tokenizer_dir.mkdir(parents=True, exist_ok=True)
target_dataset_dir.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(tokenizers_dir / "fineweb_4096_bpe.model", target_tokenizer_dir / "fineweb_4096_bpe.model")
shutil.copy2(tokenizers_dir / "fineweb_4096_bpe.vocab", target_tokenizer_dir / "fineweb_4096_bpe.vocab")
if target_dataset_dir.exists():
shutil.rmtree(target_dataset_dir)
shutil.copytree(output_dir, target_dataset_dir)

print(f"tokenizer_model:{target_tokenizer_dir / 'fineweb_4096_bpe.model'}", flush=True)
print(f"dataset_dir:{target_dataset_dir}", flush=True)
print(f"dataset_stats:{json.dumps(stats, sort_keys=True)}", flush=True)


if __name__ == "__main__":
main()