hannahcyberey committed on
Commit
a137c8f
·
1 Parent(s): 95f0c53
Files changed (3) hide show
  1. app.py +1 -1
  2. assets/vector_scaling.json +142 -0
  3. schemas.py +10 -1
app.py CHANGED
@@ -349,7 +349,7 @@ with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS,
349
  with gr.Row():
350
  layer = gr.Slider(0, 27, step=1, value=CONFIG["layer"], interactive=True, label="Steering layer", scale=2)
351
  max_new_tokens = gr.Number(CONFIG["max_new_tokens"], minimum=10, maximum=CONFIG["max_new_tokens"], interactive=True, label="Max new tokens", scale=1)
352
- vec_scaling = gr.Number(CONFIG["k"], interactive=True, label="Vector scaling", scale=1)
353
 
354
  with gr.Column(scale=1):
355
  output = gr.Textbox(label="Output", lines=15, max_lines=15, interactive=False)
 
349
  with gr.Row():
350
  layer = gr.Slider(0, 27, step=1, value=CONFIG["layer"], interactive=True, label="Steering layer", scale=2)
351
  max_new_tokens = gr.Number(CONFIG["max_new_tokens"], minimum=10, maximum=CONFIG["max_new_tokens"], interactive=True, label="Max new tokens", scale=1)
352
+ vec_scaling = gr.Number(1.0, minimum=0, interactive=True, label="Vector scaling", scale=1)
353
 
354
  with gr.Column(scale=1):
355
  output = gr.Textbox(label="Output", lines=15, max_lines=15, interactive=False)
assets/vector_scaling.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "layer": 0,
4
+ "k_pos": 4.0,
5
+ "k_neg": 4.2
6
+ },
7
+ {
8
+ "layer": 1,
9
+ "k_pos": 5.1,
10
+ "k_neg": 5.4
11
+ },
12
+ {
13
+ "layer": 2,
14
+ "k_pos": 8.2,
15
+ "k_neg": 7.9
16
+ },
17
+ {
18
+ "layer": 3,
19
+ "k_pos": 12.1,
20
+ "k_neg": 12.3
21
+ },
22
+ {
23
+ "layer": 4,
24
+ "k_pos": 13.4,
25
+ "k_neg": 13.9
26
+ },
27
+ {
28
+ "layer": 5,
29
+ "k_pos": 13.3,
30
+ "k_neg": 15.1
31
+ },
32
+ {
33
+ "layer": 6,
34
+ "k_pos": 16.3,
35
+ "k_neg": 17.2
36
+ },
37
+ {
38
+ "layer": 7,
39
+ "k_pos": 20.6,
40
+ "k_neg": 20.9
41
+ },
42
+ {
43
+ "layer": 8,
44
+ "k_pos": 28.9,
45
+ "k_neg": 28.3
46
+ },
47
+ {
48
+ "layer": 9,
49
+ "k_pos": 41.7,
50
+ "k_neg": 34.4
51
+ },
52
+ {
53
+ "layer": 10,
54
+ "k_pos": 43.7,
55
+ "k_neg": 35.1
56
+ },
57
+ {
58
+ "layer": 11,
59
+ "k_pos": 44.1,
60
+ "k_neg": 37.1
61
+ },
62
+ {
63
+ "layer": 12,
64
+ "k_pos": 48.9,
65
+ "k_neg": 41.1
66
+ },
67
+ {
68
+ "layer": 13,
69
+ "k_pos": 52.2,
70
+ "k_neg": 44.4
71
+ },
72
+ {
73
+ "layer": 14,
74
+ "k_pos": 56.0,
75
+ "k_neg": 48.3
76
+ },
77
+ {
78
+ "layer": 15,
79
+ "k_pos": 59.2,
80
+ "k_neg": 50.1
81
+ },
82
+ {
83
+ "layer": 16,
84
+ "k_pos": 63.0,
85
+ "k_neg": 52.6
86
+ },
87
+ {
88
+ "layer": 17,
89
+ "k_pos": 68.9,
90
+ "k_neg": 56.2
91
+ },
92
+ {
93
+ "layer": 18,
94
+ "k_pos": 76.2,
95
+ "k_neg": 63.4
96
+ },
97
+ {
98
+ "layer": 19,
99
+ "k_pos": 85.6,
100
+ "k_neg": 70.3
101
+ },
102
+ {
103
+ "layer": 20,
104
+ "k_pos": 96.7,
105
+ "k_neg": 78.9
106
+ },
107
+ {
108
+ "layer": 21,
109
+ "k_pos": 117.2,
110
+ "k_neg": 86.8
111
+ },
112
+ {
113
+ "layer": 22,
114
+ "k_pos": 135.0,
115
+ "k_neg": 99.8
116
+ },
117
+ {
118
+ "layer": 23,
119
+ "k_pos": 151.7,
120
+ "k_neg": 110.1
121
+ },
122
+ {
123
+ "layer": 24,
124
+ "k_pos": 172.6,
125
+ "k_neg": 125.6
126
+ },
127
+ {
128
+ "layer": 25,
129
+ "k_pos": 193.7,
130
+ "k_neg": 148.4
131
+ },
132
+ {
133
+ "layer": 26,
134
+ "k_pos": 217.1,
135
+ "k_neg": 164.0
136
+ },
137
+ {
138
+ "layer": 27,
139
+ "k_pos": 238.8,
140
+ "k_neg": 197.9
141
+ }
142
+ ]
schemas.py CHANGED
@@ -1,13 +1,16 @@
 
1
  from typing import Optional
2
  from datetime import datetime, timezone
3
  from pydantic import BaseModel, Field
4
  from pydantic.json_schema import SkipJsonSchema
5
 
 
 
6
  CONFIG = {
7
  "max_new_tokens": 3048,
8
  "top_p": 0.95,
9
  "temperature": 0.6,
10
- "k": 200,
11
  "layer": 25
12
  }
13
 
@@ -22,6 +25,12 @@ class UserRequest(BaseModel):
22
  k: float = Field(CONFIG["k"])
23
  layer: int = Field(CONFIG["layer"])
24
 
 
 
 
 
 
 
25
  def get_api_format(self):
26
  return {
27
  "prompt": self.prompt,
 
1
+ import json
2
  from typing import Optional
3
  from datetime import datetime, timezone
4
  from pydantic import BaseModel, Field
5
  from pydantic.json_schema import SkipJsonSchema
6
 
7
+ vector_scaling = json.load(open("assets/vector_scaling.json", "r"))
8
+
9
  CONFIG = {
10
  "max_new_tokens": 3048,
11
  "top_p": 0.95,
12
  "temperature": 0.6,
13
+ "k": vector_scaling[25]["k_pos"],
14
  "layer": 25
15
  }
16
 
 
25
  k: float = Field(CONFIG["k"])
26
  layer: int = Field(CONFIG["layer"])
27
 
28
+ def model_post_init(self, __context):
29
+ if self.coeff < 0:
30
+ self.k *= vector_scaling[self.layer]["k_pos"]
31
+ else:
32
+ self.k *= vector_scaling[self.layer]["k_neg"]
33
+
34
  def get_api_format(self):
35
  return {
36
  "prompt": self.prompt,