Ksjsjjdj committed on
Commit
27c9c86
·
verified ·
1 Parent(s): 3f30ea1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -10
app.py CHANGED
@@ -8,11 +8,14 @@ import sys
8
  import gc
9
  import multiprocessing
10
  import shutil
 
11
  from datetime import datetime
12
  from concurrent.futures import ThreadPoolExecutor, as_completed
13
  from itertools import chain
14
 
15
  import torch
 
 
16
  import gradio as gr
17
  import transformers
18
  import datasets
@@ -41,6 +44,34 @@ if torch.cuda.is_available():
41
 
42
  JOBS = {}
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  class JobStatus:
45
  def __init__(self):
46
  self.id = str(uuid.uuid4())
@@ -110,7 +141,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
110
 
111
  job = JOBS[job_id]
112
  job.status = "RUNNING"
113
- job.add_log("System: initializing Scratch Training Protocol...")
114
 
115
  try:
116
  if not hf_token.startswith("hf_"):
@@ -195,7 +226,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
195
  except:
196
  continue
197
 
198
- job.set_progress(0.15, "Model: Initializing Architecture from Scratch...")
199
 
200
  torch.cuda.empty_cache()
201
  gc.collect()
@@ -207,6 +238,11 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
207
  trust_remote_code=True,
208
  )
209
 
 
 
 
 
 
210
  if torch.cuda.is_available():
211
  original_model = original_model.to(torch.float16).cuda()
212
 
@@ -231,20 +267,23 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
231
  dataloader_num_workers=4,
232
  dataloader_pin_memory=True,
233
  gradient_checkpointing=True,
234
- torch_compile=False
 
 
235
  )
236
 
237
  dataset_iterable = IterableDataset.from_generator(process_stream_generator)
238
 
239
  trainer = Trainer(
240
  model=original_model,
 
241
  train_dataset=dataset_iterable,
242
  args=training_args,
243
  data_collator=data_collator,
244
  callbacks=[CustomTrainerCallback(job_id, hf_token, full_repo_id)]
245
  )
246
 
247
- job.set_progress(0.2, "Training: Full Gradient Descent Initiated...")
248
  trainer.train()
249
  trainer.save_model(output_dir)
250
  tokenizer.save_pretrained(output_dir)
@@ -279,14 +318,14 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
279
  inject_json(c_tok, "tokenizer_config.json")
280
  inject_json(c_gen, "generation_config.json")
281
 
282
- job.set_progress(0.95, "Network: Uploading Final Model...")
283
 
284
  upload_folder(
285
  folder_path=output_dir,
286
  path_in_repo=".",
287
  repo_id=full_repo_id,
288
  token=hf_token,
289
- commit_message="Scratch Trained Model"
290
  )
291
 
292
  job.repo_url = f"https://huggingface.co/{full_repo_id}"
@@ -346,10 +385,10 @@ def load_from_url(request: gr.Request):
346
  pass
347
  return gr.update(selected="launch_tab"), ""
348
 
349
- with gr.Blocks(title="Nucleus Enterprise") as demo:
350
  with gr.Column():
351
  gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
352
- gr.Markdown("Autonomous LLM Foundry | V7.0 Scratch Edition")
353
 
354
  with gr.Tabs() as main_tabs:
355
  with gr.TabItem("���� LAUNCHPAD", id="launch_tab"):
@@ -359,7 +398,7 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
359
  hf_token = gr.Textbox(label="HuggingFace Token", type="password", value=os.getenv("HF_TOKEN", ""))
360
  model_name = gr.Textbox(label="Architecture Config Source", value="Qwen/Qwen2.5-0.5B")
361
 
362
- repo_name = gr.Textbox(label="Output Repository", value="nucleus-scratch-v1")
363
  datasets = gr.Textbox(label="Datasets (CSV)", value="Salesforce/fineweb_deduplicated", lines=3)
364
 
365
  reasoning = gr.Checkbox(label="Inject Reasoning (CoT/Math)", value=False)
@@ -374,7 +413,7 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
374
  c_tok = gr.Code(label="tokenizer_config.json", language="json")
375
  c_gen = gr.Code(label="generation_config.json", language="json")
376
 
377
- btn_launch = gr.Button("INITIALIZE SCRATCH TRAINING", variant="primary", size="lg")
378
 
379
  with gr.TabItem("📡 TELEMETRY", id="monitor_tab"):
380
  with gr.Row():
 
8
  import gc
9
  import multiprocessing
10
  import shutil
11
+ import math
12
  from datetime import datetime
13
  from concurrent.futures import ThreadPoolExecutor, as_completed
14
  from itertools import chain
15
 
16
  import torch
17
+ import torch.nn as nn
18
+ import torch.nn.functional as F
19
  import gradio as gr
20
  import transformers
21
  import datasets
 
44
 
45
  JOBS = {}
46
 
47
def activation_quant(x):
    """Fake-quantize activations to the signed 8-bit range with a straight-through gradient.

    Each slice along the last dimension is scaled so that its largest
    absolute value maps to 127, rounded, clamped to [-128, 127] and scaled
    back to the original range.

    Args:
        x: Floating-point tensor; quantization statistics are taken over
            the last dimension.

    Returns:
        Tensor of the same shape as ``x`` whose forward value is the
        quantize-dequantize result and whose gradient with respect to ``x``
        is the identity.
    """
    # Absmax over the last dimension; the floor prevents division by zero
    # when a slice is entirely zero.
    absmax = x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
    scale = 127.0 / absmax
    y = (x * scale).round().clamp(-128, 127) / scale
    # Straight-through estimator: detach the whole quantization residual so
    # no gradient reaches x through `scale`. `round()` has zero gradient, but
    # the division by `scale` does not, so leaving `y` attached would add a
    # spurious gradient on the max element of every slice.
    return x + (y - x).detach()
51
+
52
def weight_quant(w):
    """Fake-quantize a weight tensor to ternary levels with a straight-through gradient.

    The tensor is scaled by the reciprocal of its mean absolute value,
    rounded, clamped to {-1, 0, 1} and scaled back, so the forward value
    takes one of three levels: ``-mean|w|``, ``0`` or ``+mean|w|``.

    Args:
        w: Floating-point weight tensor; the scale is computed over the
            whole tensor.

    Returns:
        Tensor of the same shape as ``w`` whose forward value is the
        ternarized weight and whose gradient with respect to ``w`` is the
        identity.
    """
    # The floor keeps the scale finite for an all-zero weight tensor.
    mean_abs = w.abs().mean().clamp(min=1e-5)
    scale = 1.0 / mean_abs
    u = (w * scale).round().clamp(-1, 1) / scale
    # Straight-through estimator: detach the full residual. Leaving `u`
    # attached would push gradient through `scale` (the mean of |w|) into
    # every weight, on top of the intended identity gradient.
    return w + (u - w).detach()
56
+
57
class BitLinear(nn.Linear):
    """``nn.Linear`` variant that fake-quantizes its inputs and weights on every forward pass.

    Parameters are stored exactly as in ``nn.Linear``; quantization is
    applied on the fly, and the bias is used as-is.
    """

    def forward(self, x):
        """Apply the linear map using quantized activations and quantized weights."""
        quantized_input = activation_quant(x)
        quantized_weight = weight_quant(self.weight)
        return F.linear(quantized_input, quantized_weight, self.bias)
62
+
63
def convert_to_bitnet(model, copy_weights=False):
    """Recursively replace every ``nn.Linear`` beneath ``model`` with a ``BitLinear``, in place.

    Args:
        model: Module whose descendants are rewritten. It is mutated; nothing
            is returned.
        copy_weights: When true, each replacement receives a copy of the
            original layer's weight (and bias, if present). When false, the
            replacement keeps its fresh default initialization.

    Notes:
        Layers that are already ``BitLinear`` are left untouched, so calling
        this twice does not discard trained or copied parameters.
        NOTE(review): a replaced layer gets a brand-new weight parameter, so
        any weight sharing the original layer took part in (e.g. an output
        head tied to an embedding) is presumably broken — confirm against the
        model being converted.
    """
    # Snapshot the children first: attributes on `model` are reassigned
    # while walking them.
    for name, module in list(model.named_children()):
        if isinstance(module, BitLinear):
            # BitLinear subclasses nn.Linear; without this guard a repeated
            # call would rebuild (and, with copy_weights=False, re-initialize)
            # layers that were already converted.
            continue
        if isinstance(module, nn.Linear):
            has_bias = module.bias is not None
            # Build the replacement on the same device and dtype as the layer
            # it replaces so a model that was already moved or cast does not
            # end up with mixed devices/dtypes.
            bit_linear = BitLinear(
                module.in_features,
                module.out_features,
                bias=has_bias,
                device=module.weight.device,
                dtype=module.weight.dtype,
            )
            if copy_weights:
                with torch.no_grad():
                    bit_linear.weight.copy_(module.weight)
                    if has_bias:
                        bit_linear.bias.copy_(module.bias)
            setattr(model, name, bit_linear)
        else:
            convert_to_bitnet(module, copy_weights=copy_weights)
74
+
75
  class JobStatus:
76
  def __init__(self):
77
  self.id = str(uuid.uuid4())
 
141
 
142
  job = JOBS[job_id]
143
  job.status = "RUNNING"
144
+ job.add_log("System: initializing BitNet Scratch Protocol...")
145
 
146
  try:
147
  if not hf_token.startswith("hf_"):
 
226
  except:
227
  continue
228
 
229
+ job.set_progress(0.15, "Model: Initializing Architecture & Converting to BitNet...")
230
 
231
  torch.cuda.empty_cache()
232
  gc.collect()
 
238
  trust_remote_code=True,
239
  )
240
 
241
+ convert_to_bitnet(original_model, copy_weights=False)
242
+
243
+ model_size = sum(t.numel() for t in original_model.parameters())
244
+ job.add_log(f"Model Size: {model_size/1000**2:.1f}M Parameters (1.58-bit)")
245
+
246
  if torch.cuda.is_available():
247
  original_model = original_model.to(torch.float16).cuda()
248
 
 
267
  dataloader_num_workers=4,
268
  dataloader_pin_memory=True,
269
  gradient_checkpointing=True,
270
+ torch_compile=False,
271
+ lr_scheduler_type="cosine",
272
+ warmup_steps=0.1
273
  )
274
 
275
  dataset_iterable = IterableDataset.from_generator(process_stream_generator)
276
 
277
  trainer = Trainer(
278
  model=original_model,
279
+ tokenizer=tokenizer,
280
  train_dataset=dataset_iterable,
281
  args=training_args,
282
  data_collator=data_collator,
283
  callbacks=[CustomTrainerCallback(job_id, hf_token, full_repo_id)]
284
  )
285
 
286
+ job.set_progress(0.2, "Training: BitNet Gradient Descent Initiated...")
287
  trainer.train()
288
  trainer.save_model(output_dir)
289
  tokenizer.save_pretrained(output_dir)
 
318
  inject_json(c_tok, "tokenizer_config.json")
319
  inject_json(c_gen, "generation_config.json")
320
 
321
+ job.set_progress(0.95, "Network: Uploading Final BitNet Model...")
322
 
323
  upload_folder(
324
  folder_path=output_dir,
325
  path_in_repo=".",
326
  repo_id=full_repo_id,
327
  token=hf_token,
328
+ commit_message="BitNet Scratch Trained Model"
329
  )
330
 
331
  job.repo_url = f"https://huggingface.co/{full_repo_id}"
 
385
  pass
386
  return gr.update(selected="launch_tab"), ""
387
 
388
+ with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
389
  with gr.Column():
390
  gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
391
+ gr.Markdown("Autonomous LLM Foundry | V9.0 BitNet Edition")
392
 
393
  with gr.Tabs() as main_tabs:
394
  with gr.TabItem("���� LAUNCHPAD", id="launch_tab"):
 
398
  hf_token = gr.Textbox(label="HuggingFace Token", type="password", value=os.getenv("HF_TOKEN", ""))
399
  model_name = gr.Textbox(label="Architecture Config Source", value="Qwen/Qwen2.5-0.5B")
400
 
401
+ repo_name = gr.Textbox(label="Output Repository", value="nucleus-bitnet-v1")
402
  datasets = gr.Textbox(label="Datasets (CSV)", value="Salesforce/fineweb_deduplicated", lines=3)
403
 
404
  reasoning = gr.Checkbox(label="Inject Reasoning (CoT/Math)", value=False)
 
413
  c_tok = gr.Code(label="tokenizer_config.json", language="json")
414
  c_gen = gr.Code(label="generation_config.json", language="json")
415
 
416
+ btn_launch = gr.Button("INITIALIZE BITNET TRAINING", variant="primary", size="lg")
417
 
418
  with gr.TabItem("📡 TELEMETRY", id="monitor_tab"):
419
  with gr.Row():