Train

Sleeping

App Files Files Community

Ksjsjjdj committed 20 days ago

Commit

27c9c86

verified ·

1 Parent(s): 3f30ea1

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -10

app.py CHANGED Viewed

@@ -8,11 +8,14 @@ import sys
 import gc
 import multiprocessing
 import shutil
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from itertools import chain
 import torch
 import gradio as gr
 import transformers
 import datasets
@@ -41,6 +44,34 @@ if torch.cuda.is_available():
 JOBS = {}
 class JobStatus:
     def __init__(self):
         self.id = str(uuid.uuid4())
@@ -110,7 +141,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
     job = JOBS[job_id]
     job.status = "RUNNING"
-    job.add_log("System: initializing Scratch Training Protocol...")
     try:
         if not hf_token.startswith("hf_"):
@@ -195,7 +226,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
                 except:
                     continue
-        job.set_progress(0.15, "Model: Initializing Architecture from Scratch...")
         torch.cuda.empty_cache()
         gc.collect()
@@ -207,6 +238,11 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
             trust_remote_code=True,
         )
         if torch.cuda.is_available():
             original_model = original_model.to(torch.float16).cuda()
@@ -231,20 +267,23 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
             dataloader_num_workers=4,
             dataloader_pin_memory=True,
             gradient_checkpointing=True,
-            torch_compile=False
         )
         dataset_iterable = IterableDataset.from_generator(process_stream_generator)
         trainer = Trainer(
             model=original_model,
             train_dataset=dataset_iterable,
             args=training_args,
             data_collator=data_collator,
             callbacks=[CustomTrainerCallback(job_id, hf_token, full_repo_id)]
         )
-        job.set_progress(0.2, "Training: Full Gradient Descent Initiated...")
         trainer.train()
         trainer.save_model(output_dir)
         tokenizer.save_pretrained(output_dir)
@@ -279,14 +318,14 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
         inject_json(c_tok, "tokenizer_config.json")
         inject_json(c_gen, "generation_config.json")
-        job.set_progress(0.95, "Network: Uploading Final Model...")
         upload_folder(
             folder_path=output_dir,
             path_in_repo=".",
             repo_id=full_repo_id,
             token=hf_token,
-            commit_message="Scratch Trained Model"
         )
         job.repo_url = f"https://huggingface.co/{full_repo_id}"
@@ -346,10 +385,10 @@ def load_from_url(request: gr.Request):
         pass
     return gr.update(selected="launch_tab"), ""
-with gr.Blocks(title="Nucleus Enterprise") as demo:
     with gr.Column():
         gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
-        gr.Markdown("Autonomous LLM Foundry | V7.0 Scratch Edition")
         with gr.Tabs() as main_tabs:
             with gr.TabItem("���� LAUNCHPAD", id="launch_tab"):
@@ -359,7 +398,7 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
                             hf_token = gr.Textbox(label="HuggingFace Token", type="password", value=os.getenv("HF_TOKEN", ""))
                             model_name = gr.Textbox(label="Architecture Config Source", value="Qwen/Qwen2.5-0.5B")
-                        repo_name = gr.Textbox(label="Output Repository", value="nucleus-scratch-v1")
                         datasets = gr.Textbox(label="Datasets (CSV)", value="Salesforce/fineweb_deduplicated", lines=3)
                         reasoning = gr.Checkbox(label="Inject Reasoning (CoT/Math)", value=False)
@@ -374,7 +413,7 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
                     c_tok = gr.Code(label="tokenizer_config.json", language="json")
                     c_gen = gr.Code(label="generation_config.json", language="json")
-                btn_launch = gr.Button("INITIALIZE SCRATCH TRAINING", variant="primary", size="lg")
             with gr.TabItem("📡 TELEMETRY", id="monitor_tab"):
                 with gr.Row():

 import gc
 import multiprocessing
 import shutil
+import math
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from itertools import chain
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 import gradio as gr
 import transformers
 import datasets
 JOBS = {}
+def activation_quant(x):
     # Fake-quantize `x` to 8-bit integer levels and return it in the input's
     # own scale. The scale is computed separately for each slice along the
     # last dimension (dim=-1, keepdim=True).
     #
     # scale maps the largest magnitude in each slice to 127; the clamp to
     # 1e-5 keeps an all-zero slice from causing a division by zero.
+    scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp_(min=1e-5)
     # Round onto the integer grid, clip to [-128, 127], then undo the scaling.
+    y = (x * scale).round().clamp_(-128, 127) / scale
     # `x - x.detach()` contributes zero to the value but lets gradient reach
     # `x` directly, so the rounding step does not block backpropagation.
     # NOTE(review): `y` is not detached here, so gradient also flows through
     # `scale`; the usual straight-through form is `x + (y - x).detach()` --
     # confirm which is intended.
     # NOTE(review): in low-precision dtypes `(y + x) - x` may not reproduce
     # `y` exactly -- confirm this is acceptable.
+    return y + x - x.detach()
+def weight_quant(w):
     # Fake-quantize a weight tensor to three levels: {-1, 0, +1} multiplied
     # by the mean absolute value of `w`. One scale is used for the whole
     # tensor (`.mean()` is taken over all elements).
     #
     # scale is the reciprocal of mean(|w|); the clamp to 1e-5 guards against
     # division by zero for an all-zero tensor.
+    scale = 1.0 / w.abs().mean().clamp_(min=1e-5)
     # Round the rescaled weights, clip to [-1, 1], then restore the scale.
+    u = (w * scale).round().clamp_(-1, 1) / scale
     # `w - w.detach()` is zero in value but passes gradient straight to `w`.
     # NOTE(review): `u` is not detached, so gradient also flows through
     # `scale` -- confirm this differs from `w + (u - w).detach()` on purpose.
+    return u + w - w.detach()
+class BitLinear(nn.Linear):
     # nn.Linear variant that fake-quantizes its weight and its input on every
     # forward pass. Parameters are stored exactly as in nn.Linear; the
     # quantized tensors are recomputed per call and never written back.
+    def forward(self, x):
         # Three-level weights via weight_quant, 8-bit-range activations via
         # activation_quant; the bias (if any) is used unquantized.
+        w = weight_quant(self.weight)
+        x = activation_quant(x)
+        return F.linear(x, w, self.bias)
+def convert_to_bitnet(model, copy_weights=False):
     # Recursively replace every nn.Linear child of `model` with a BitLinear
     # of the same in/out features and bias setting. The model is modified in
     # place and nothing is returned.
     #
     # copy_weights=False leaves each new layer with the values its
     # constructor produced; copy_weights=True clones the old layer's weight
     # (and bias, when present) into the new one.
     #
     # NOTE(review): the isinstance check matches every nn.Linear, including
     # any output head and layers that are already BitLinear -- confirm that
     # replacing all of them is intended.
+    for name, module in model.named_children():
+        if isinstance(module, nn.Linear):
             # NOTE(review): no device/dtype is passed, so the replacement is
             # built with the framework defaults; presumably the caller moves
             # or casts the model afterwards -- verify at the call site.
+            bit_linear = BitLinear(module.in_features, module.out_features, module.bias is not None)
+            if copy_weights:
+                bit_linear.weight.data = module.weight.data.clone()
+                if module.bias is not None:
+                    bit_linear.bias.data = module.bias.data.clone()
             # Rebind the attribute on the parent so the new layer takes the
             # old one's place under the same name.
+            setattr(model, name, bit_linear)
+        else:
             # Not a linear layer: descend into its children.
+            convert_to_bitnet(module, copy_weights=copy_weights)
 class JobStatus:
     def __init__(self):
         self.id = str(uuid.uuid4())
     job = JOBS[job_id]
     job.status = "RUNNING"
+    job.add_log("System: initializing BitNet Scratch Protocol...")
     try:
         if not hf_token.startswith("hf_"):
                 except:
                     continue
+        job.set_progress(0.15, "Model: Initializing Architecture & Converting to BitNet...")
         torch.cuda.empty_cache()
         gc.collect()
             trust_remote_code=True,
         )
+        convert_to_bitnet(original_model, copy_weights=False)
+        model_size = sum(t.numel() for t in original_model.parameters())
+        job.add_log(f"Model Size: {model_size/1000**2:.1f}M Parameters (1.58-bit)")
         if torch.cuda.is_available():
             original_model = original_model.to(torch.float16).cuda()
             dataloader_num_workers=4,
             dataloader_pin_memory=True,
             gradient_checkpointing=True,
+            torch_compile=False,
+            lr_scheduler_type="cosine",
+            warmup_steps=0.1
         )
         dataset_iterable = IterableDataset.from_generator(process_stream_generator)
         trainer = Trainer(
             model=original_model,
+            tokenizer=tokenizer,
             train_dataset=dataset_iterable,
             args=training_args,
             data_collator=data_collator,
             callbacks=[CustomTrainerCallback(job_id, hf_token, full_repo_id)]
         )
+        job.set_progress(0.2, "Training: BitNet Gradient Descent Initiated...")
         trainer.train()
         trainer.save_model(output_dir)
         tokenizer.save_pretrained(output_dir)
         inject_json(c_tok, "tokenizer_config.json")
         inject_json(c_gen, "generation_config.json")
+        job.set_progress(0.95, "Network: Uploading Final BitNet Model...")
         upload_folder(
             folder_path=output_dir,
             path_in_repo=".",
             repo_id=full_repo_id,
             token=hf_token,
+            commit_message="BitNet Scratch Trained Model"
         )
         job.repo_url = f"https://huggingface.co/{full_repo_id}"
         pass
     return gr.update(selected="launch_tab"), ""
+with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
     with gr.Column():
         gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
+        gr.Markdown("Autonomous LLM Foundry | V9.0 BitNet Edition")
         with gr.Tabs() as main_tabs:
             with gr.TabItem("���� LAUNCHPAD", id="launch_tab"):
                             hf_token = gr.Textbox(label="HuggingFace Token", type="password", value=os.getenv("HF_TOKEN", ""))
                             model_name = gr.Textbox(label="Architecture Config Source", value="Qwen/Qwen2.5-0.5B")
+                        repo_name = gr.Textbox(label="Output Repository", value="nucleus-bitnet-v1")
                         datasets = gr.Textbox(label="Datasets (CSV)", value="Salesforce/fineweb_deduplicated", lines=3)
                         reasoning = gr.Checkbox(label="Inject Reasoning (CoT/Math)", value=False)
                     c_tok = gr.Code(label="tokenizer_config.json", language="json")
                     c_gen = gr.Code(label="generation_config.json", language="json")
+                btn_launch = gr.Button("INITIALIZE BITNET TRAINING", variant="primary", size="lg")
             with gr.TabItem("📡 TELEMETRY", id="monitor_tab"):
                 with gr.Row():