Update app.py
Browse files
app.py
CHANGED
|
@@ -18,7 +18,7 @@ import transformers
|
|
| 18 |
import datasets
|
| 19 |
from dotenv import load_dotenv
|
| 20 |
from datasets import load_dataset, get_dataset_config_names, IterableDataset
|
| 21 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback, AutoConfig
|
| 22 |
from huggingface_hub import login, whoami, create_repo, upload_folder
|
| 23 |
import spaces
|
| 24 |
|
|
@@ -190,7 +190,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
|
|
| 190 |
if len(batch_buffer) >= 100:
|
| 191 |
encoded_batch = tokenizer(batch_buffer, truncation=True, max_length=2048, padding=False)
|
| 192 |
for input_ids in encoded_batch["input_ids"]:
|
| 193 |
-
yield {"input_ids": input_ids
|
| 194 |
batch_buffer = []
|
| 195 |
except:
|
| 196 |
continue
|
|
@@ -212,6 +212,8 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
|
|
| 212 |
|
| 213 |
output_dir = f"checkpoints/{job_id}"
|
| 214 |
|
|
|
|
|
|
|
| 215 |
training_args = TrainingArguments(
|
| 216 |
output_dir=output_dir,
|
| 217 |
per_device_train_batch_size=int(batch_size),
|
|
@@ -238,6 +240,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
|
|
| 238 |
model=original_model,
|
| 239 |
train_dataset=dataset_iterable,
|
| 240 |
args=training_args,
|
|
|
|
| 241 |
callbacks=[CustomTrainerCallback(job_id, hf_token, full_repo_id)]
|
| 242 |
)
|
| 243 |
|
|
@@ -343,7 +346,7 @@ def load_from_url(request: gr.Request):
|
|
| 343 |
pass
|
| 344 |
return gr.update(selected="launch_tab"), ""
|
| 345 |
|
| 346 |
-
with gr.Blocks(title="Nucleus Enterprise"
|
| 347 |
with gr.Column():
|
| 348 |
gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
|
| 349 |
gr.Markdown("Autonomous LLM Foundry | V7.0 Scratch Edition")
|
|
|
|
| 18 |
import datasets
|
| 19 |
from dotenv import load_dotenv
|
| 20 |
from datasets import load_dataset, get_dataset_config_names, IterableDataset
|
| 21 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback, AutoConfig, DataCollatorForLanguageModeling
|
| 22 |
from huggingface_hub import login, whoami, create_repo, upload_folder
|
| 23 |
import spaces
|
| 24 |
|
|
|
|
| 190 |
if len(batch_buffer) >= 100:
|
| 191 |
encoded_batch = tokenizer(batch_buffer, truncation=True, max_length=2048, padding=False)
|
| 192 |
for input_ids in encoded_batch["input_ids"]:
|
| 193 |
+
yield {"input_ids": input_ids}
|
| 194 |
batch_buffer = []
|
| 195 |
except:
|
| 196 |
continue
|
|
|
|
| 212 |
|
| 213 |
output_dir = f"checkpoints/{job_id}"
|
| 214 |
|
| 215 |
+
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
| 216 |
+
|
| 217 |
training_args = TrainingArguments(
|
| 218 |
output_dir=output_dir,
|
| 219 |
per_device_train_batch_size=int(batch_size),
|
|
|
|
| 240 |
model=original_model,
|
| 241 |
train_dataset=dataset_iterable,
|
| 242 |
args=training_args,
|
| 243 |
+
data_collator=data_collator,
|
| 244 |
callbacks=[CustomTrainerCallback(job_id, hf_token, full_repo_id)]
|
| 245 |
)
|
| 246 |
|
|
|
|
| 346 |
pass
|
| 347 |
return gr.update(selected="launch_tab"), ""
|
| 348 |
|
| 349 |
+
with gr.Blocks(title="Nucleus Enterprise") as demo:
|
| 350 |
with gr.Column():
|
| 351 |
gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
|
| 352 |
gr.Markdown("Autonomous LLM Foundry | V7.0 Scratch Edition")
|