hftestbackend

Runtime error

App Files Files Community

hftestbackend / app.py

Sergidev

Update app.py

66f04b1 verified about 1 year ago

raw

history blame contribute delete

4.12 kB

	import os
	from threading import Thread
	from typing import Iterator
	import gradio as gr
	import spaces
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
	# Markdown text displayed at the top of the page (passed to gr.Markdown in the UI).
	DESCRIPTION = """\
	# Llama backend

	This is a demo of text completion with AI LLM's.

	Enter your text in the box below and click "Complete" to have the AI generate a completion for your input. The generated text will be appended to your input. You can stop the generation at any time by clicking the "Stop" button.
	"""

	# Upper bound and initial value of the "Max new tokens" slider.
	MAX_MAX_NEW_TOKENS = 2048
	DEFAULT_MAX_NEW_TOKENS = 1024
	# Prompts longer than this many tokens are trimmed in generate().
	# Read from the MAX_INPUT_TOKEN_LENGTH environment variable, falling back to 4096.
	MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

	# NOTE(review): `device` is not referenced again in the visible code; generate()
	# moves its inputs to `model.device` instead.
	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	model_id = "meta-llama/Llama-3.1-8B"
	# Tokenizer and model are loaded once, at import time, and shared by every request.
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	# NOTE(review): `load_in_8bit=True` is passed directly alongside `torch_dtype`;
	# recent transformers releases expect 8-bit loading to be requested through
	# `quantization_config=BitsAndBytesConfig(load_in_8bit=True)` - confirm against
	# the pinned transformers/bitsandbytes versions.
	model = AutoModelForCausalLM.from_pretrained(
	model_id,
	device_map="auto",
	torch_dtype=torch.float16,
	load_in_8bit=True,
	)

	# Put the module in evaluation mode (torch.nn.Module.eval).
	model.eval()
	@spaces.GPU
	def generate(
	    message: str,
	    max_new_tokens: int = 1024,
	    temperature: float = 0.6,
	    top_p: float = 0.1,
	    top_k: int = 50,
	    repetition_penalty: float = 1.2,
	) -> Iterator[str]:
	    """Stream a sampled continuation of ``message``.

	    The prompt is tokenized, trimmed to its last ``MAX_INPUT_TOKEN_LENGTH``
	    tokens when it is longer (a ``gr.Warning`` is shown in that case), and
	    handed to ``model.generate`` on a background thread. Decoded text is read
	    from a ``TextIteratorStreamer`` as it is produced.

	    Args:
	        message: Prompt text to complete.
	        max_new_tokens: Forwarded to ``model.generate``.
	        temperature: Forwarded to ``model.generate``.
	        top_p: Forwarded to ``model.generate``.
	        top_k: Forwarded to ``model.generate``.
	        repetition_penalty: Forwarded to ``model.generate``.

	    Yields:
	        ``message`` followed by all text generated so far, once per chunk
	        delivered by the streamer.

	    Raises:
	        Exception: Whatever ``model.generate`` raised on the worker thread is
	            re-raised here once the stream ends. Iterating the streamer also
	            raises if no text arrives within the streamer timeout.
	    """
	    # Imported locally so this function stays self-contained and does not
	    # rely on changes to the file-level import block.
	    from threading import Event

	    from transformers import StoppingCriteria, StoppingCriteriaList

	    stream_timeout = 20.0  # seconds; used for the streamer and the final join

	    input_ids = tokenizer.encode(message, return_tensors="pt")
	    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
	        # Keep the most recent tokens: the end of the prompt is what gets continued.
	        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
	        gr.Warning(f"Trimmed input as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
	    input_ids = input_ids.to(model.device)

	    # Set when the consumer goes away (Stop button, client disconnect, error)
	    # so the worker stops instead of running on to max_new_tokens.
	    stop_requested = Event()

	    class _StopOnRequest(StoppingCriteria):
	        """Ends generation for every sequence once ``stop_requested`` is set."""

	        def __call__(self, input_ids, scores, **kwargs):
	            return torch.full(
	                (input_ids.shape[0],),
	                stop_requested.is_set(),
	                dtype=torch.bool,
	                device=input_ids.device,
	            )

	    streamer = TextIteratorStreamer(
	        tokenizer,
	        timeout=stream_timeout,
	        skip_prompt=True,
	        skip_special_tokens=True,
	    )
	    generate_kwargs = dict(
	        input_ids=input_ids,
	        streamer=streamer,
	        max_new_tokens=max_new_tokens,
	        do_sample=True,
	        top_p=top_p,
	        top_k=top_k,
	        temperature=temperature,
	        num_beams=1,
	        repetition_penalty=repetition_penalty,
	        stopping_criteria=StoppingCriteriaList([_StopOnRequest()]),
	    )

	    worker_errors = []

	    def _run_generation():
	        # Thread boundary: capture the failure so the consumer can re-raise it,
	        # and close the stream so the consumer is not left waiting for the
	        # streamer timeout.
	        try:
	            model.generate(**generate_kwargs)
	        except Exception as exc:
	            worker_errors.append(exc)
	            streamer.end()

	    worker = Thread(target=_run_generation)
	    worker.start()

	    partial_message = message
	    try:
	        for text in streamer:
	            partial_message += text
	            yield partial_message
	        if worker_errors:
	            raise worker_errors[0]
	    finally:
	        # Runs on normal completion, on error, and when the generator is
	        # closed early (e.g. the event was cancelled).
	        stop_requested.set()
	        worker.join(timeout=stream_timeout)

	# Page layout and event wiring for the app. Components created inside this
	# context are attached to `demo`.
	with gr.Blocks(css="style.css", fill_height=True) as demo:
	# Header: the DESCRIPTION markdown followed by a "duplicate this Space" button.
	gr.Markdown(DESCRIPTION)
	gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")

	with gr.Row():
	with gr.Column(scale=4):
	# Used as both the prompt input and the output: every value yielded by
	# generate() is written back into this textbox.
	text_box = gr.Textbox(
	label="Enter your text",
	placeholder="Type your message here...",
	lines=5
	)
	with gr.Column(scale=1):
	complete_button = gr.Button("Complete")
	stop_button = gr.Button("Stop")
	# Sampling controls. Their values are passed positionally to generate() in
	# the order given by `inputs` on the click handler.
	# NOTE(review): the initial values for temperature (0.1) and top_p (0.9)
	# differ from generate()'s own defaults (0.6 and 0.1) - confirm which is intended.
	max_new_tokens = gr.Slider(
	label="Max new tokens",
	minimum=1,
	maximum=MAX_MAX_NEW_TOKENS,
	step=1,
	value=DEFAULT_MAX_NEW_TOKENS,
	)
	temperature = gr.Slider(
	label="Temperature",
	minimum=0.1,
	maximum=1.0,
	step=0.1,
	value=0.1,
	)
	top_p = gr.Slider(
	label="Top-p (nucleus sampling)",
	minimum=0.05,
	maximum=1.0,
	step=0.05,
	value=0.9,
	)
	top_k = gr.Slider(
	label="Top-k",
	minimum=1,
	maximum=100, # Changed from 1000 to 100
	step=1,
	value=50,
	)
	repetition_penalty = gr.Slider(
	label="Repetition penalty",
	minimum=1.0,
	maximum=2.0,
	step=0.05,
	value=1.2,
	)

	# Set up the generation event
	# The returned event handle is kept so the Stop button can cancel it.
	generation_event = complete_button.click(
	generate,
	inputs=[text_box, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
	outputs=[text_box],
	)

	# Set up the stop event
	# No function, inputs or outputs: the click's only effect is cancelling
	# `generation_event`.
	stop_button.click(
	None,
	None,
	None,
	cancels=[generation_event]
	)

	if __name__ == "__main__":
	    # Script entry point: enable the request queue (max_size=20), then start
	    # serving the app.
	    queued_demo = demo.queue(max_size=20)
	    queued_demo.launch()