-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathdemo.py
More file actions
executable file
·109 lines (89 loc) · 3.89 KB
/
demo.py
File metadata and controls
executable file
·109 lines (89 loc) · 3.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = ["pyllmq", "wandb", "tqdm"]
# ///
# Note: make sure pyllmq.cpython... is on the path, or run with uv
import pyllmq
import numpy as np
import time
from tqdm import tqdm
from pathlib import Path
def main():
    """Train a small Qwen2-style model on TinyStories with pyllmq, report the
    validation loss, and export the resulting weights to ``demo-model``.

    Side effects: may generate tokenized data under ``data/tiny-stories-qwen``,
    prints progress/GPU telemetry to stdout, writes the exported model to disk.
    """
    # Model architecture (hyperparameters of a Qwen2-family 0.5B-scale model).
    config = pyllmq.Config(
        architecture="qwen2",
        hidden_size=896,
        intermediate_size=4864,
        max_position_embeddings=32768,
        num_attention_heads=14,
        num_hidden_layers=24,
        num_key_value_heads=2,
        rms_norm_eps=1e-06,
        tie_word_embeddings=True,
        dtype="bfloat16",
        vocab_size=151936,
    )
    # Training options — defaults; see pyllmq.LLamaOptions for available knobs.
    options = pyllmq.LLamaOptions()
    # Run configuration.
    ngpu = 1
    batch_size = 2
    seq_len = 1024
    grad_accumulation = 4
    steps = 100
    eval_steps = 10
    # Optimizer hyperparameters (passed to trainer.update below).
    grad_clip = 1.0
    weight_decay = 1e-2
    beta_1 = 0.9
    beta_2 = 0.95
    lr = 1e-5

    # Ensure tokenized training data exists; generate it on first run.
    if not Path("data/tiny-stories-qwen").exists():
        print("generating training data...")
        # Absolute import: this file runs as a script (see shebang), so a
        # relative import (`from .tokenize_data ...`) would fail with
        # "attempted relative import with no known parent package".
        from tokenize_data import generate_tokenized_dataset
        generate_tokenized_dataset("tiny-stories", "qwen")
    tiny_stories = list(map(str, Path("data/tiny-stories-qwen").glob("train-*.bin")))

    # Token buffers filled by the loaders. Using pyllmq.DataLoader is _not_
    # required — any code that fills in_tokens/out_tokens works.
    in_tokens = np.empty((ngpu * batch_size, seq_len), dtype=np.int32)
    out_tokens = np.empty((ngpu * batch_size, seq_len), dtype=np.int32)
    train_loader = pyllmq.DataLoader(tiny_stories, ngpu * batch_size * seq_len, 42)
    eval_loader = pyllmq.DataLoader(
        ["data/tiny-stories-qwen/eval.bin"], ngpu * batch_size * seq_len, 42
    )

    # Create the trainer object and initialize the weights.
    print("creating trainer...")
    trainer = pyllmq.LLMQTrainer(
        ngpu=ngpu,
        config=config,
        options=options,
        batch_size=batch_size,
        seq_len=seq_len,
        grad_accum=grad_accumulation,
    )
    print("initializing weights...")
    trainer.init_weights()
    # Alternative to random init — load pretrained weights:
    # pyllmq.LLMQTrainer.from_pretrained("Qwen/Qwen2.5-0.5B", ngpu=ngpu,
    #     dtype="bf16", options=options, batch_size=2, seq_len=1024,
    #     grad_accum=grad_accumulation)

    print("\nmemory consumption:")
    for k, v in trainer.get_allocator_info(0).items():
        print(f" {k:20}: {v['device'] // 1024 // 1024:6} MiB")

    # Prefetch the first batch before entering the loop.
    train_loader.load_batch(in_tokens, out_tokens)
    print("\nstarting training...\n")
    start = time.perf_counter()
    for step in range(steps):
        for s in range(grad_accumulation):
            trainer.step(in_tokens, out_tokens)
            # Overlap loading of the next batch with the in-flight step.
            # On the last micro-step this prefetches the batch for the next
            # outer iteration, hiding load latency behind update(); the
            # original code issued one extra load_batch here after the loop,
            # which overwrote (and silently skipped) one batch per step.
            train_loader.load_batch(in_tokens, out_tokens)
        if step % 10 == 0:
            # Periodic GPU telemetry.
            for info in trainer.get_gpu_info():
                print(f" power: {info.power // 1000:4}W temp: {info.temperature:3}°C rx: {info.pcie_rx // 1024 // 1024:4}MiB/s tx: {info.pcie_tx // 1024 // 1024:4}MiB/s")
                print(f" clock: {info.clock / 1000:3.1f}GHz fan: {info.fan:3}% throttle: {info.throttle_reason}")
        result = trainer.update(lr, beta_1, beta_2, step + 1, weight_decay, grad_clip)
        duration = time.perf_counter() - start
        start = time.perf_counter()
        print(f"step: {step:5} loss: {result['loss']:6.3f} norm: {result['norm']:6.3f} time: {duration:6.3f}s")

    # Evaluation: average the validation loss over the number of batches
    # actually run (the original divided by eval_steps even when the eval
    # set had fewer chunks, skewing the reported loss).
    n_eval = min(eval_steps, eval_loader.num_chunks - 1)
    val_loss = 0.0
    print("\neval...")
    for _ in tqdm(range(n_eval)):
        eval_loader.load_batch(in_tokens, out_tokens)
        val_loss += trainer.validate(in_tokens, out_tokens)[0]
    print(f"eval loss: {val_loss / max(n_eval, 1):6.3f}")
    # Persist the trained weights.
    trainer.export_model("demo-model")
# Entry point: run the demo training loop when executed as a script.
if __name__ == "__main__":
    main()