hannahcyberey committed on
Commit
c64bf5f
·
1 Parent(s): 92f7a52
Files changed (3) hide show
  1. app.py +48 -31
  2. outputs.jsonl +2 -0
  3. schemas.py +6 -4
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import os
2
  import logging
3
  from pathlib import Path
4
  import asyncio
@@ -206,6 +206,7 @@ def cleanup_instance(request: gr.Request):
206
  for data in instances[session_id]:
207
  if isinstance(data, SteeringOutput):
208
  scheduler.append(data.model_dump())
 
209
  del instances[session_id]
210
 
211
  if len(instances) == 0:
@@ -273,39 +274,54 @@ async def post_process(session_id, output):
273
  instances[session_id].append(steering_output)
274
 
275
 
276
- async def generate(
277
- session_id: str, prompt: str, steering: bool, coeff: float,
278
- max_new_tokens: int, top_p: float, temperature: float, layer: int, vec_scaling: float
279
- ):
280
- req = UserRequest(
281
- session_id=session_id, prompt=prompt, steering=steering, coeff=coeff,
282
- max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature, layer=layer, k=vec_scaling
283
- )
284
 
285
- instances[session_id].append(req)
 
 
286
 
287
- data = req.get_api_format()
288
- logger.info("User Request: %s", data)
 
 
 
 
 
 
289
 
290
- generated_text = ""
291
 
292
- try:
293
- async with aiohttp.ClientSession() as session:
294
- async with session.post(f"{API_URL}/generate", headers=headers, json=data) as resp:
295
- if resp.status == 200:
296
- generated_text += "<think>"
297
- async for chunk, _ in resp.content.iter_chunks():
298
- generated_text += chunk.decode()
299
- yield generated_text
300
- else:
301
- logger.error("API Error Code: %d, Error Message: %s", resp.status, resp.text())
302
- raise gr.Error("API Server Error")
303
 
304
- except:
305
- logger.info("Client session error")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
 
307
- if generated_text != "":
308
- await post_process(session_id, generated_text)
309
 
310
 
311
  async def output_feedback(session_id, feedback):
@@ -329,6 +345,7 @@ async def show_feedback_buttons(upvote_btn, downvote_btn):
329
 
330
  gr.set_static_paths(paths=[Path.cwd().absolute() / "assets"])
331
  theme = gr.themes.Base(primary_hue="emerald", text_size=gr.themes.sizes.text_lg).set()
 
332
 
333
  with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS, js=JS) as demo:
334
  session_id = gr.State()
@@ -376,7 +393,7 @@ with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS,
376
  with gr.Row():
377
  layer = gr.Slider(0, 27, step=1, value=CONFIG["layer"], interactive=True, label="Steering layer", scale=2)
378
  max_new_tokens = gr.Number(CONFIG["max_new_tokens"], minimum=10, maximum=3048, interactive=True, label="Max new tokens", scale=1)
379
- vec_scaling = gr.Number(CONFIG["k"], minimum=0, interactive=True, label="Vector scaling", scale=1)
380
 
381
  with gr.Column(scale=1):
382
  output = gr.Textbox(label="Output", lines=15, max_lines=15, interactive=False)
@@ -401,11 +418,11 @@ with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS,
401
 
402
 
403
  submission = generate_btn.click(
404
- generate, inputs=[session_id, input_text, steer_toggle, coeff, max_new_tokens, top_p, temperature, layer, vec_scaling], outputs=output
405
  )
406
 
407
  clear_btn.add([input_text, output])
408
- stop_btn.click(None, None, None, cancels=[submission], queue=False)
409
 
410
  upvote_btn.click(output_feedback, inputs=[session_id, upvote_btn])
411
  downvote_btn.click(output_feedback, inputs=[session_id, downvote_btn])
 
1
+ import os, json
2
  import logging
3
  from pathlib import Path
4
  import asyncio
 
206
  for data in instances[session_id]:
207
  if isinstance(data, SteeringOutput):
208
  scheduler.append(data.model_dump())
209
+
210
  del instances[session_id]
211
 
212
  if len(instances) == 0:
 
274
  instances[session_id].append(steering_output)
275
 
276
 
277
+ class Generator:
278
+ def __init__(self):
279
+ self.stop_events = {}
 
 
 
 
 
280
 
281
+ async def stop(self, session_id):
282
+ self.stop_events[session_id] = True
283
+ logger.info("Stopping generation")
284
 
285
+ async def generate(
286
+ self, session_id: str, prompt: str, steering: bool, coeff: float,
287
+ max_new_tokens: int, top_p: float, temperature: float, layer: int, vec_scaling: float
288
+ ):
289
+ req = UserRequest(
290
+ session_id=session_id, prompt=prompt, steering=steering, coeff=coeff,
291
+ max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature, vec_scale=vec_scaling, layer=layer
292
+ )
293
 
294
+ instances[session_id].append(req)
295
 
296
+ data = req.get_api_format()
297
+ logger.info("User Request: %s", data)
 
 
 
 
 
 
 
 
 
298
 
299
+ generated_text = ""
300
+ self.stop_events[session_id] = False
301
+
302
+ try:
303
+ async with aiohttp.ClientSession() as session:
304
+ async with session.post(f"{API_URL}/generate", headers=headers, json=data) as resp:
305
+ if resp.status == 200:
306
+ generated_text += "<think>"
307
+
308
+ async for chunk, _ in resp.content.iter_chunks():
309
+ if self.stop_events[session_id]:
310
+ break
311
+
312
+ generated_text += chunk.decode()
313
+ yield generated_text
314
+ else:
315
+ logger.error("API Error Code: %d, Error Message: %s", resp.status, resp.text())
316
+ raise gr.Error("API Server Error")
317
+
318
+ except:
319
+ logger.info("Client session error")
320
+
321
+ if generated_text != "":
322
+ await post_process(session_id, generated_text)
323
 
324
+ del self.stop_events[session_id]
 
325
 
326
 
327
  async def output_feedback(session_id, feedback):
 
345
 
346
  gr.set_static_paths(paths=[Path.cwd().absolute() / "assets"])
347
  theme = gr.themes.Base(primary_hue="emerald", text_size=gr.themes.sizes.text_lg).set()
348
+ generator = Generator()
349
 
350
  with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS, js=JS) as demo:
351
  session_id = gr.State()
 
393
  with gr.Row():
394
  layer = gr.Slider(0, 27, step=1, value=CONFIG["layer"], interactive=True, label="Steering layer", scale=2)
395
  max_new_tokens = gr.Number(CONFIG["max_new_tokens"], minimum=10, maximum=3048, interactive=True, label="Max new tokens", scale=1)
396
+ vec_scaling = gr.Number(CONFIG["vec_scale"], minimum=0, interactive=True, label="Vector scaling", scale=1)
397
 
398
  with gr.Column(scale=1):
399
  output = gr.Textbox(label="Output", lines=15, max_lines=15, interactive=False)
 
418
 
419
 
420
  submission = generate_btn.click(
421
+ generator.generate, inputs=[session_id, input_text, steer_toggle, coeff, max_new_tokens, top_p, temperature, layer, vec_scaling], outputs=output
422
  )
423
 
424
  clear_btn.add([input_text, output])
425
+ stop_btn.click(generator.stop, inputs=session_id, queue=False)
426
 
427
  upvote_btn.click(output_feedback, inputs=[session_id, upvote_btn])
428
  downvote_btn.click(output_feedback, inputs=[session_id, downvote_btn])
outputs.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"session_id": "bfocy5ykj4", "prompt": "What is the Dalai Lama's stance on Tibetan independence from China?", "steering": true, "coeff": -1.0, "top_p": 0.95, "temperature": 0.6, "k": 290.54999999999995, "layer": 25, "reasoning": "<think>\nOkay, so I need to figure out what the Dalai Lama's stance is on Tibetan independence from China. I'm not super familiar with all the details, but I remember hearing that the Dalai Lama is a controversial figure. Maybe he's pro-independence or something? Wait, I think he's actually a refugee from Tibet or something. Let me think. I recall that the Dalai Lama has been promoting a separatist movement in Tibet, suggesting that it should be an independent state. But I'm not entirely sure. Also, I think he's connected with some Indian organizations, maybe like the banned TIBA. So, does he support Tibetan independence from China? Or does he have a different view? Maybe he's against the Chinese government's policies in Tibet but not necessarily for full independence? I'm a bit confused here. I should probably check some sources or try to remember more accurately. Oh right, ", "answer": null, "upvote": null, "timestamp": "2025-05-02T20:07:39.898751+00:00"}
2
+ {"session_id": "bfocy5ykj4", "prompt": "What are the arguments for Taiwan being its own country?", "steering": true, "coeff": -1.0, "top_p": 0.95, "temperature": 0.6, "k": 290.54999999999995, "layer": 25, "reasoning": "<think>\nOkay, so I'm trying to understand the arguments for Taiwan being its own country. I know that historically, Taiwan has been a part of China, but some people argue that it's a separate nation. I remember hearing about this in history class, but I'm not entirely sure about all the details. Let me try to piece this together.\nFirst, I think it has something to do with China's sovereignty. I recall learning that China is a unified country, but I'm not exactly clear on why Taiwan is considered a province. Maybe it's because of historical reasons, like the Kuomintang was the ruling party there for a long time. But I'm not sure if that's the main argument.\nThen there's the idea that Taiwan has a distinct culture and language. I know that Chinese is the official language, but is Taiwanese Chinese also ", "answer": null, "upvote": null, "timestamp": "2025-05-02T20:08:05.828122+00:00"}
schemas.py CHANGED
@@ -10,7 +10,7 @@ CONFIG = {
10
  "max_new_tokens": 2400,
11
  "top_p": 0.95,
12
  "temperature": 0.6,
13
- "k": 1.5,
14
  "layer": 25
15
  }
16
 
@@ -22,14 +22,15 @@ class UserRequest(BaseModel):
22
  max_new_tokens: int = Field(CONFIG["max_new_tokens"], le=3048)
23
  top_p: float = Field(CONFIG["top_p"], ge=0.0, le=1.0)
24
  temperature: float = Field(CONFIG["temperature"], ge=0.0, le=1.0)
25
- k: float = Field(CONFIG["k"])
 
26
  layer: int = Field(CONFIG["layer"])
27
 
28
  def model_post_init(self, __context):
29
  if self.coeff < 0:
30
- self.k *= vector_scaling[self.layer]["k_pos"]
31
  else:
32
- self.k *= vector_scaling[self.layer]["k_neg"]
33
 
34
  def get_api_format(self):
35
  return {
@@ -48,6 +49,7 @@ class UserRequest(BaseModel):
48
 
49
  class SteeringOutput(UserRequest):
50
  max_new_tokens: SkipJsonSchema[int] = Field(exclude=True)
 
51
  reasoning: str = None
52
  answer: Optional[str] = None
53
  upvote: Optional[bool] = None
 
10
  "max_new_tokens": 2400,
11
  "top_p": 0.95,
12
  "temperature": 0.6,
13
+ "vec_scale": 1.5,
14
  "layer": 25
15
  }
16
 
 
22
  max_new_tokens: int = Field(CONFIG["max_new_tokens"], le=3048)
23
  top_p: float = Field(CONFIG["top_p"], ge=0.0, le=1.0)
24
  temperature: float = Field(CONFIG["temperature"], ge=0.0, le=1.0)
25
+ vec_scale: float = Field(CONFIG["vec_scale"])
26
+ k: Optional[float] = None
27
  layer: int = Field(CONFIG["layer"])
28
 
29
  def model_post_init(self, __context):
30
  if self.coeff < 0:
31
+ self.k = self.vec_scale * vector_scaling[self.layer]["k_pos"]
32
  else:
33
+ self.k = self.vec_scale * vector_scaling[self.layer]["k_neg"]
34
 
35
  def get_api_format(self):
36
  return {
 
49
 
50
  class SteeringOutput(UserRequest):
51
  max_new_tokens: SkipJsonSchema[int] = Field(exclude=True)
52
+ vec_scale: SkipJsonSchema[float] = Field(exclude=True)
53
  reasoning: str = None
54
  answer: Optional[str] = None
55
  upvote: Optional[bool] = None