Ephraimmm committed on
Commit
4d743b7
·
verified ·
1 Parent(s): 893184e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -111
app.py CHANGED
@@ -1,126 +1,92 @@
1
- import torch
2
- import gc
3
- import json
4
- import time
5
- from threading import Thread
6
-
7
  import gradio as gr
 
8
  from unsloth import FastLanguageModel
9
  from transformers import TextIteratorStreamer
10
-
11
- # ---------------------
12
- # Setup + Model Load
13
- # ---------------------
14
-
15
# Free cached CUDA memory and collect garbage before the model is loaded.
torch.cuda.empty_cache()
gc.collect()

MODEL_ID = "Ephraimmm/PIDGIN_gemma-3"
CONTEXT_LEN = 128000  # Gemma-3 default context window as per blog

print("Using Unsloth Gemma-3 model with 128K context window...")

# Requires up-to-date packages:
#   pip install -U unsloth unsloth_zoo transformers

# Options handed to Unsloth when loading the checkpoint.
_LOAD_OPTIONS = {
    "model_name": MODEL_ID,
    "max_seq_length": CONTEXT_LEN,
    "dtype": None,  # Let Unsloth pick appropriate dtype
    "load_in_4bit": True,
    "trust_remote_code": True,
}

# Load the model and tokenizer, then switch the model to inference mode.
model, tokenizer = FastLanguageModel.from_pretrained(**_LOAD_OPTIONS)
FastLanguageModel.for_inference(model)
print("✅ Model loaded (4-bit dynamic if available)")
37
 
38
- # ---------------------
39
- # Chat Streaming Function
40
- # ---------------------
41
def stream_chat(message, history):
    """Stream the assistant's reply to ``message`` as it is generated.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs;
            ``None`` or an empty list means there are no earlier turns.

    Yields:
        The reply text accumulated so far; each yield extends the last one.
    """
    messages = [
        {"role": "system", "content": "You be Naija assistant. You must always reply for Pidgin English."}
    ]
    for human, bot in history or []:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": message})

    # apply_chat_template handles the model-specific prompt formatting.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # skip_prompt=True keeps the decoded prompt (system text and history) out
    # of the stream; without it the prompt is echoed at the start of the reply.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generate_kwargs = dict(
        input_ids=inputs,
        streamer=streamer,
        max_new_tokens=512,
        temperature=0.8,
        do_sample=True,
        top_p=0.9,
    )

    # generate() blocks until done, so it runs in a background thread while
    # this generator drains the streamer.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    output = ""
    for partial in streamer:
        output += partial
        yield output
78
-
79
- # ---------------------
80
- # Save chat to file (JSON format)
81
- # ---------------------
82
def save_chat(history):
    """Write the conversation to a timestamped JSON file.

    Args:
        history: Chat turns as ``(user_text, assistant_text)`` pairs.
            ``None`` is treated as an empty conversation, matching how
            ``respond`` handles a missing history.

    Returns:
        The name of the file written, ``conversation_<YYYYmmdd-HHMMSS>.json``,
        relative to the current working directory.
    """
    export = []
    for human, bot in history or []:
        export.extend(
            (
                {"role": "user", "content": human},
                {"role": "assistant", "content": bot},
            )
        )

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    fname = f"conversation_{timestamp}.json"
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(export, f, ensure_ascii=False, indent=2)
    return fname
93
 
94
- # ---------------------
95
- # UI with Gradio
96
- # ---------------------
97
# Chat UI: transcript, input row, and a row of utility buttons.
# NOTE(review): nesting was reconstructed from a flattened listing; confirm
# that `download_file` belongs inside the second row.
with gr.Blocks(title="🇳🇬 PIDGIN Gemma-3 Chatbot") as demo:
    gr.HTML("<h1><center>🇳🇬 PIDGIN Gemma-3 Chatbot</center></h1>")
    chatbot = gr.Chatbot(height=450, show_label=False)

    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", lines=2, scale=4)
        send = gr.Button("Send", variant="primary", scale=1, size="lg")

    with gr.Row():
        clear = gr.Button("Clear Chat", variant="secondary", scale=1)
        save_btn = gr.Button("💾 Save Conversation", variant="secondary", scale=1)
        download_file = gr.File()

    def respond(message, history):
        """Yield ``(updated_history, "")`` pairs while the reply streams in.

        The empty string clears the input textbox on every update.
        """
        prior_turns = [] if history is None else history
        latest = ""
        for latest in stream_chat(message, prior_turns):
            yield prior_turns + [(message, latest)], ""
        # Final update; also covers the case where nothing was streamed.
        yield prior_turns + [(message, latest)], ""

    # Pressing Enter and clicking "Send" share the same wiring.
    chat_io = {"inputs": [msg, chatbot], "outputs": [chatbot, msg]}
    msg.submit(respond, **chat_io)
    send.click(respond, **chat_io)
    clear.click(lambda: ([], ""), outputs=[chatbot, msg])
    save_btn.click(save_chat, inputs=[chatbot], outputs=[download_file])

if __name__ == "__main__":
    demo.launch(share=True, debug=True)
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import torch
3
  from unsloth import FastLanguageModel
4
  from transformers import TextIteratorStreamer
5
+ import threading
6
+ from peft import PeftModel
7
+
8
+ # -----------------------------
9
+ # 1️⃣ Set device
10
+ # -----------------------------
11
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Using device:", device)

# -----------------------------
# 2️⃣ Load base model (skip compilation)
# -----------------------------
base_model_name = "Ephraimmm/PIDGIN_gemma-3"
_base_load_options = {
    "model_name": base_model_name,
    "max_seq_length": 2048,
    "dtype": torch.float16,
    "load_in_4bit": False,
    "disable_compile": True,  # <- avoids unsloth compilation errors
}
base_model, tokenizer = FastLanguageModel.from_pretrained(**_base_load_options)

# -----------------------------
# 3️⃣ Load LoRA
# -----------------------------
# NOTE(review): the adapter repo is the same id as the base model repo above;
# confirm this repo really holds a LoRA adapter to apply on top of the
# already-loaded model.
lora_repo = "Ephraimmm/PIDGIN_gemma-3"
lora_model = PeftModel.from_pretrained(
    base_model,
    lora_repo,
    adapter_name="adapter_model",
)
FastLanguageModel.for_inference(lora_model)
32
+
33
+ # -----------------------------
34
+ # 4️⃣ Streaming generation function with Nigerian Pidgin system prompt
35
+ # -----------------------------
36
def generate_response(user_message):
    """Generate a Nigerian Pidgin reply to a single user message.

    Generation runs in a background thread and is consumed through a
    ``TextIteratorStreamer``; the pieces are joined and returned as one string.

    Args:
        user_message: Text typed by the user. Blank or missing input returns
            ``""`` without invoking the model.

    Returns:
        The complete decoded reply, without the prompt or special tokens.

    Raises:
        Exception: Any error raised by ``lora_model.generate`` in the worker
            thread is re-raised in the calling thread.
    """
    # Nothing to answer: skip a full generation pass on empty input.
    if not user_message or not user_message.strip():
        return ""

    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You be Nigerian assistant wey sabi Pidgin English only. No speak any other language."}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_message}],
        },
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
        return_dict=True,
    ).to(device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=256,
        # temperature / top_p / top_k are sampling parameters; enable sampling
        # explicitly so they do not depend on the checkpoint's defaults.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        # NOTE(review): kept from the original; running without the KV cache
        # is normally much slower. Confirm it is still required.
        use_cache=False,
    )

    errors = []

    def generate():
        try:
            lora_model.generate(**generation_kwargs)
        except Exception as exc:  # re-raised in the caller after join()
            errors.append(exc)
            # Unblock the consumer below, which would otherwise wait forever
            # on a stream that never receives its end-of-stream signal.
            streamer.end()

    thread = threading.Thread(target=generate)
    thread.start()

    # Iterating the streamer blocks until the worker signals end of stream.
    full_response = "".join(streamer)
    thread.join()

    if errors:
        raise errors[0]
    return full_response
80
+
81
+ # -----------------------------
82
+ # 5️⃣ Gradio interface
83
+ # -----------------------------
84
# Single-turn UI: one textbox in, one textbox out, backed by generate_response.
_message_box = gr.Textbox(lines=2, placeholder="Enter your message...")
_reply_box = gr.Textbox(label="PIDGIN Assistant")

iface = gr.Interface(
    fn=generate_response,
    inputs=_message_box,
    outputs=_reply_box,
    title="Nigerian PIDGIN Assistant",
    description="Chat with a Nigerian assistant that only speaks Pidgin English.",
)

# Start the web server as soon as the script runs.
iface.launch()