nroggendorff committed
Commit 2b46203 · 1 Parent(s): 5e16e26

initial commit

Files changed (3):
  1. README.md +6 -8
  2. app.py +54 -17
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,15 +1,13 @@
 ---
-title: Gradio Chatbot
+title: Smallama
 emoji: 💬
-colorFrom: yellow
-colorTo: purple
+colorFrom: red
+colorTo: blue
 sdk: gradio
-sdk_version: 5.42.0
+sdk_version: 5.49.1
 app_file: app.py
 pinned: false
-hf_oauth: true
-hf_oauth_scopes:
-  - inference-api
+license: mit
 ---
 
-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
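
The metadata change retitles the Space to Smallama, swaps the theme colors, bumps the Gradio SDK to 5.49.1, adds an MIT license, and drops the hf_oauth keys: inference now runs locally (see app.py below) rather than through the Inference API, so no user login or inference-api scope is needed. Applying the hunk, the resulting front matter reads:

---
title: Smallama
emoji: 💬
colorFrom: red
colorTo: blue
sdk: gradio
sdk_version: 5.49.1
app_file: app.py
pinned: false
license: mit
---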
app.py CHANGED
@@ -1,5 +1,56 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from spaces import GPU as gpu
+
+
+class Delta:
+    def __init__(self, content):
+        self.content = content
+
+
+class Choice:
+    def __init__(self, delta):
+        self.delta = delta
+
+
+class InferenceClient:
+    def __init__(self, model_id="nroggendorff/smallama-it"):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        self.model = AutoModelForCausalLM.from_pretrained(model_id)
+
+    class ModelOutput:
+        def __init__(self, client, inputs):
+            self.client = client
+            self.inputs = inputs
+            self.choices = []
+
+        def decode(self, output):
+            decoded_output = self.client.tokenizer.decode(
+                output[0][self.inputs["input_ids"].shape[-1] :],
+                skip_special_tokens=True,
+            )
+            self.choices = [Choice(Delta(decoded_output))]
+            return self
+
+    @gpu
+    def chat_completion(
+        self, messages, max_tokens=256, stream=True, temperature=0.2, top_p=0.95
+    ):
+        inputs = self.tokenizer.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(self.model.device)
+
+        model_output = self.ModelOutput(self, inputs)
+
+        for _ in range(max_tokens):
+            output = self.model.generate(
+                **inputs, max_new_tokens=1, temperature=temperature, top_p=top_p
+            )
+            yield model_output.decode(output)
 
 
 def respond(
@@ -9,17 +60,10 @@ def respond(
     max_tokens,
     temperature,
     top_p,
-    hf_token: gr.OAuthToken,
 ):
-    """
-    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-    """
-    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-
+    client = InferenceClient()
    messages = [{"role": "system", "content": system_message}]
-
     messages.extend(history)
-
     messages.append({"role": "user", "content": message})
 
     response = ""
@@ -35,21 +79,16 @@ def respond(
         token = ""
         if len(choices) and choices[0].delta.content:
             token = choices[0].delta.content
-
         response += token
         yield response
 
 
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
 chatbot = gr.ChatInterface(
     respond,
     type="messages",
     additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.2, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
@@ -61,8 +100,6 @@ chatbot = gr.ChatInterface(
 )
 
 with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
     chatbot.render()
 
 
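The new app.py swaps the hosted huggingface_hub.InferenceClient (and the OAuth login it required) for a local shim of the same name: Delta, Choice, and InferenceClient.ModelOutput reproduce just enough of the chat_completion streaming interface (chunks exposing choices[0].delta.content) that respond() is left almost untouched, while generation now runs nroggendorff/smallama-it through transformers. The spaces.GPU decorator (imported as gpu) requests ZeroGPU hardware for the duration of each chat_completion call.

One caveat in the committed loop: generate() is called with max_new_tokens=1 on the same inputs every iteration, so each step re-runs the prompt from scratch instead of feeding the sampled token back, and temperature/top_p have no effect unless do_sample=True is passed. A more conventional streaming sketch (not part of this commit; it reuses the model id from above, and stream_chat is a hypothetical helper) would hand generate() a TextIteratorStreamer:

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "nroggendorff/smallama-it"  # same model as the commit
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)


def stream_chat(messages, max_tokens=256, temperature=0.2, top_p=0.95):
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() runs once in a background thread; the streamer yields each
    # decoded chunk as it is produced, instead of re-running the prompt.
    Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            do_sample=True,  # without this, temperature/top_p are ignored
            temperature=temperature,
            top_p=top_p,
        ),
    ).start()
    response = ""
    for chunk in streamer:
        response += chunk
        yield response  # cumulative text, matching what respond() yields

With this shape, the ChatInterface could consume stream_chat directly and the Delta/Choice/ModelOutput adapter classes would no longer be needed.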
requirements.txt ADDED
@@ -0,0 +1,2 @@
+git+https://github.com/huggingface/transformers
+accelerate
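
The new requirements.txt installs transformers from the main branch on GitHub (unpinned, so builds track whatever main is at build time) plus accelerate for device placement. Gradio itself comes from the Space image, pinned by sdk_version in the README, and the spaces package is preinstalled on ZeroGPU hardware. To mirror the environment locally, something like the following should work (outside a Space, spaces.GPU degrades to a pass-through decorator):

pip install "git+https://github.com/huggingface/transformers" accelerate gradio spaces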