Ksjsjjdj committed on
Commit 3f30ea1 · verified · 1 Parent(s): 0122482

Update app.py

Files changed (1)
  1. app.py +6 -3
app.py CHANGED
@@ -18,7 +18,7 @@ import transformers
 import datasets
 from dotenv import load_dotenv
 from datasets import load_dataset, get_dataset_config_names, IterableDataset
-from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback, AutoConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback, AutoConfig, DataCollatorForLanguageModeling
 from huggingface_hub import login, whoami, create_repo, upload_folder
 import spaces
 
@@ -190,7 +190,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
             if len(batch_buffer) >= 100:
                 encoded_batch = tokenizer(batch_buffer, truncation=True, max_length=2048, padding=False)
                 for input_ids in encoded_batch["input_ids"]:
-                    yield {"input_ids": input_ids, "labels": input_ids}
+                    yield {"input_ids": input_ids}
                 batch_buffer = []
         except:
             continue
@@ -212,6 +212,8 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
 
     output_dir = f"checkpoints/{job_id}"
 
+    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
     training_args = TrainingArguments(
         output_dir=output_dir,
         per_device_train_batch_size=int(batch_size),
@@ -238,6 +240,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name,
         model=original_model,
         train_dataset=dataset_iterable,
         args=training_args,
+        data_collator=data_collator,
         callbacks=[CustomTrainerCallback(job_id, hf_token, full_repo_id)]
     )
 
@@ -343,7 +346,7 @@ def load_from_url(request: gr.Request):
         pass
     return gr.update(selected="launch_tab"), ""
 
-with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
+with gr.Blocks(title="Nucleus Enterprise") as demo:
     with gr.Column():
         gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
         gr.Markdown("Autonomous LLM Foundry | V7.0 Scratch Edition")