Commit
·
a137c8f
1
Parent(s):
95f0c53
update
Browse files
- app.py +1 -1
- assets/vector_scaling.json +142 -0
- schemas.py +10 -1
app.py
CHANGED
|
@@ -349,7 +349,7 @@ with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS,
|
|
| 349 |
with gr.Row():
|
| 350 |
layer = gr.Slider(0, 27, step=1, value=CONFIG["layer"], interactive=True, label="Steering layer", scale=2)
|
| 351 |
max_new_tokens = gr.Number(CONFIG["max_new_tokens"], minimum=10, maximum=CONFIG["max_new_tokens"], interactive=True, label="Max new tokens", scale=1)
|
| 352 |
-
vec_scaling = gr.Number(
|
| 353 |
|
| 354 |
with gr.Column(scale=1):
|
| 355 |
output = gr.Textbox(label="Output", lines=15, max_lines=15, interactive=False)
|
|
|
|
| 349 |
with gr.Row():
|
| 350 |
layer = gr.Slider(0, 27, step=1, value=CONFIG["layer"], interactive=True, label="Steering layer", scale=2)
|
| 351 |
max_new_tokens = gr.Number(CONFIG["max_new_tokens"], minimum=10, maximum=CONFIG["max_new_tokens"], interactive=True, label="Max new tokens", scale=1)
|
| 352 |
+
vec_scaling = gr.Number(1.0, minimum=0, interactive=True, label="Vector scaling", scale=1)
|
| 353 |
|
| 354 |
with gr.Column(scale=1):
|
| 355 |
output = gr.Textbox(label="Output", lines=15, max_lines=15, interactive=False)
|
assets/vector_scaling.json
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"layer": 0,
|
| 4 |
+
"k_pos": 4.0,
|
| 5 |
+
"k_neg": 4.2
|
| 6 |
+
},
|
| 7 |
+
{
|
| 8 |
+
"layer": 1,
|
| 9 |
+
"k_pos": 5.1,
|
| 10 |
+
"k_neg": 5.4
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"layer": 2,
|
| 14 |
+
"k_pos": 8.2,
|
| 15 |
+
"k_neg": 7.9
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"layer": 3,
|
| 19 |
+
"k_pos": 12.1,
|
| 20 |
+
"k_neg": 12.3
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"layer": 4,
|
| 24 |
+
"k_pos": 13.4,
|
| 25 |
+
"k_neg": 13.9
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"layer": 5,
|
| 29 |
+
"k_pos": 13.3,
|
| 30 |
+
"k_neg": 15.1
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"layer": 6,
|
| 34 |
+
"k_pos": 16.3,
|
| 35 |
+
"k_neg": 17.2
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"layer": 7,
|
| 39 |
+
"k_pos": 20.6,
|
| 40 |
+
"k_neg": 20.9
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"layer": 8,
|
| 44 |
+
"k_pos": 28.9,
|
| 45 |
+
"k_neg": 28.3
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"layer": 9,
|
| 49 |
+
"k_pos": 41.7,
|
| 50 |
+
"k_neg": 34.4
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"layer": 10,
|
| 54 |
+
"k_pos": 43.7,
|
| 55 |
+
"k_neg": 35.1
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"layer": 11,
|
| 59 |
+
"k_pos": 44.1,
|
| 60 |
+
"k_neg": 37.1
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"layer": 12,
|
| 64 |
+
"k_pos": 48.9,
|
| 65 |
+
"k_neg": 41.1
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"layer": 13,
|
| 69 |
+
"k_pos": 52.2,
|
| 70 |
+
"k_neg": 44.4
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"layer": 14,
|
| 74 |
+
"k_pos": 56.0,
|
| 75 |
+
"k_neg": 48.3
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"layer": 15,
|
| 79 |
+
"k_pos": 59.2,
|
| 80 |
+
"k_neg": 50.1
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"layer": 16,
|
| 84 |
+
"k_pos": 63.0,
|
| 85 |
+
"k_neg": 52.6
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"layer": 17,
|
| 89 |
+
"k_pos": 68.9,
|
| 90 |
+
"k_neg": 56.2
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"layer": 18,
|
| 94 |
+
"k_pos": 76.2,
|
| 95 |
+
"k_neg": 63.4
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"layer": 19,
|
| 99 |
+
"k_pos": 85.6,
|
| 100 |
+
"k_neg": 70.3
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"layer": 20,
|
| 104 |
+
"k_pos": 96.7,
|
| 105 |
+
"k_neg": 78.9
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"layer": 21,
|
| 109 |
+
"k_pos": 117.2,
|
| 110 |
+
"k_neg": 86.8
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"layer": 22,
|
| 114 |
+
"k_pos": 135.0,
|
| 115 |
+
"k_neg": 99.8
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"layer": 23,
|
| 119 |
+
"k_pos": 151.7,
|
| 120 |
+
"k_neg": 110.1
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"layer": 24,
|
| 124 |
+
"k_pos": 172.6,
|
| 125 |
+
"k_neg": 125.6
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"layer": 25,
|
| 129 |
+
"k_pos": 193.7,
|
| 130 |
+
"k_neg": 148.4
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"layer": 26,
|
| 134 |
+
"k_pos": 217.1,
|
| 135 |
+
"k_neg": 164.0
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"layer": 27,
|
| 139 |
+
"k_pos": 238.8,
|
| 140 |
+
"k_neg": 197.9
|
| 141 |
+
}
|
| 142 |
+
]
|
schemas.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
|
|
| 1 |
from typing import Optional
|
| 2 |
from datetime import datetime, timezone
|
| 3 |
from pydantic import BaseModel, Field
|
| 4 |
from pydantic.json_schema import SkipJsonSchema
|
| 5 |
|
|
|
|
|
|
|
| 6 |
CONFIG = {
|
| 7 |
"max_new_tokens": 3048,
|
| 8 |
"top_p": 0.95,
|
| 9 |
"temperature": 0.6,
|
| 10 |
-
"k":
|
| 11 |
"layer": 25
|
| 12 |
}
|
| 13 |
|
|
@@ -22,6 +25,12 @@ class UserRequest(BaseModel):
|
|
| 22 |
k: float = Field(CONFIG["k"])
|
| 23 |
layer: int = Field(CONFIG["layer"])
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def get_api_format(self):
|
| 26 |
return {
|
| 27 |
"prompt": self.prompt,
|
|
|
|
| 1 |
+
import json
|
| 2 |
from typing import Optional
|
| 3 |
from datetime import datetime, timezone
|
| 4 |
from pydantic import BaseModel, Field
|
| 5 |
from pydantic.json_schema import SkipJsonSchema
|
| 6 |
|
| 7 |
+
# Per-layer scaling table loaded once at import time. The context manager
# guarantees the file handle is closed even if JSON parsing raises.
with open("assets/vector_scaling.json", "r", encoding="utf-8") as _scaling_file:
    vector_scaling = json.load(_scaling_file)
|
| 8 |
+
|
| 9 |
CONFIG = {
|
| 10 |
"max_new_tokens": 3048,
|
| 11 |
"top_p": 0.95,
|
| 12 |
"temperature": 0.6,
|
| 13 |
+
"k": vector_scaling[25]["k_pos"],
|
| 14 |
"layer": 25
|
| 15 |
}
|
| 16 |
|
|
|
|
| 25 |
k: float = Field(CONFIG["k"])
|
| 26 |
layer: int = Field(CONFIG["layer"])
|
| 27 |
|
| 28 |
+
def model_post_init(self, __context):
    """Multiply ``self.k`` by the per-layer factor from ``vector_scaling``.

    The entry at index ``self.layer`` is used: its ``"k_pos"`` value when
    ``self.coeff`` is negative, its ``"k_neg"`` value otherwise.
    """
    # NOTE(review): a negative coeff selecting "k_pos" looks inverted by
    # name -- confirm this mapping is intended.
    # NOTE(review): the default for ``k`` (CONFIG["k"]) is itself a k_pos
    # factor, so a request that omits ``k`` appears to be scaled twice --
    # verify against callers.
    factor_key = "k_pos" if self.coeff < 0 else "k_neg"
    self.k *= vector_scaling[self.layer][factor_key]
|
| 33 |
+
|
| 34 |
def get_api_format(self):
|
| 35 |
return {
|
| 36 |
"prompt": self.prompt,
|