Liman committed
Commit 71772cd · 1 Parent(s): e960c98

Imported model from jaxmef repo

Browse files
Files changed (7)
  1. README.md +101 -3
  2. config.json +25 -0
  3. onnx/model.onnx +3 -0
  4. special_tokens_map.json +37 -0
  5. tokenizer.json +0 -0
  6. tokenizer_config.json +57 -0
  7. vocab.txt +0 -0
README.md CHANGED
@@ -1,3 +1,101 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ datasets:
+ - liamdugan/raid
+ metrics:
+ - accuracy
+ - f1
+ - roc_auc
+ base_model:
+ - intfloat/e5-small
+ - MayZhou/e5-small-lora-ai-generated-detector
+ model-index:
+ - name: A Shared Benchmark for Robust Evaluation of Machine-Generated Text Detectors
+   results:
+   - task:
+       type: text-classification
+     dataset:
+       name: RAID-test
+       type: RAID-test
+     metrics:
+     - name: accuracy
+       type: accuracy
+       value: 0.939
+     source:
+       name: RAID Benchmark Leaderboard
+       url: https://raid-bench.xyz/leaderboard
+ pipeline_tag: text-classification
+ ---
+ 
+ # LoRA Fine-Tuned AI-generated Detector
+ 
+ > Disclaimer
+ >
+ > This ONNX model was converted from the original model available in [safetensors format](https://huggingface.co/MayZhou/e5-small-lora-ai-generated-detector). The conversion was performed to enable compatibility with frameworks and tools that consume ONNX models.
+ >
+ > Please note that this repository is not affiliated with the creators of the original model. All credit for the model’s development belongs to the original authors. To access the original model, please visit: [Original Model Link](https://huggingface.co/MayZhou/e5-small-lora-ai-generated-detector).
+ >
+ > If you have any questions about the original model, its licensing, or its usage, please refer to the source link provided above.
+ 
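+ The exact export settings used for this conversion are not documented here, but a conversion along these lines can be reproduced with Hugging Face Optimum. A minimal sketch (the output directory name is illustrative):
+ 
+ ```python
+ # Sketch: exporting the original safetensors checkpoint to ONNX with Optimum.
+ # export=True converts the PyTorch weights to an ONNX graph on the fly.
+ from optimum.onnxruntime import ORTModelForSequenceClassification
+ from transformers import AutoTokenizer
+ 
+ model_id = "MayZhou/e5-small-lora-ai-generated-detector"
+ model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ 
+ # Write model.onnx plus the tokenizer files next to it.
+ model.save_pretrained("onnx-export")
+ tokenizer.save_pretrained("onnx-export")
+ ```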
+ This is an e5-small model fine-tuned with LoRA for sequence classification. It classifies text as AI-generated or human-written with high accuracy; a usage sketch follows the label mapping below.
+ 
+ - **Label_0**: Represents **human-written** content.
+ - **Label_1**: Represents **AI-generated** content.
+ 
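+ A minimal inference sketch with `onnxruntime`. The local path is a placeholder for wherever this repository is downloaded, and the graph's input names are assumed to match the tokenizer's output keys (as they do for standard Optimum BERT exports):
+ 
+ ```python
+ # Sketch: classifying a text with the exported ONNX graph.
+ import numpy as np
+ import onnxruntime as ort
+ from transformers import AutoTokenizer
+ 
+ repo_dir = "path/to/this/repo"  # placeholder: local copy of this repository
+ tokenizer = AutoTokenizer.from_pretrained(repo_dir)
+ session = ort.InferenceSession(f"{repo_dir}/onnx/model.onnx")
+ 
+ inputs = tokenizer("Your text here.", return_tensors="np")
+ logits = session.run(None, dict(inputs))[0]
+ 
+ # Label 0 = human-written, Label 1 = AI-generated (see mapping above).
+ label = int(np.argmax(logits, axis=-1)[0])
+ print("AI-generated" if label == 1 else "human-written")
+ ```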
+ ## Model Details
+ 
+ - **Base Model**: `intfloat/e5-small`
+ - **Fine-Tuning Technique**: LoRA (Low-Rank Adaptation)
+ - **Task**: Sequence classification
+ - **Use Cases**: Text classification for AI-generated text detection.
+ - **Hyperparameters** (a reproduction sketch follows this list):
+   - Learning rate: `5e-5`
+   - Epochs: `3`
+   - LoRA rank: `8`
+   - LoRA alpha: `16`
+ 
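+ The original training script is not published, so the following PEFT sketch only illustrates a setup consistent with the hyperparameters above; in particular, `target_modules` is an assumption:
+ 
+ ```python
+ # Sketch: LoRA fine-tuning setup matching the listed hyperparameters.
+ from peft import LoraConfig, TaskType, get_peft_model
+ from transformers import AutoModelForSequenceClassification
+ 
+ base = AutoModelForSequenceClassification.from_pretrained(
+     "intfloat/e5-small", num_labels=2
+ )
+ config = LoraConfig(
+     task_type=TaskType.SEQ_CLS,
+     r=8,            # LoRA rank (from the model card)
+     lora_alpha=16,  # LoRA alpha (from the model card)
+     target_modules=["query", "value"],  # assumption: not documented in the card
+ )
+ model = get_peft_model(base, config)
+ model.print_trainable_parameters()
+ # Training (learning rate 5e-5, 3 epochs) can then run via transformers.Trainer.
+ ```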
+ ## Training Details
+ 
+ - **Dataset**:
+   - 10,000 tweets and 10,000 versions of them rewritten by GPT-4o-mini.
+   - 80,000 human-written texts from [RAID-train](https://github.com/liamdugan/raid).
+   - 128,000 AI-generated texts from [RAID-train](https://github.com/liamdugan/raid).
+ - **Hardware**: Fine-tuned on a single NVIDIA A100 GPU.
+ - **Training Time**: Approximately 2 hours.
+ - **Evaluation Metrics**:
+ 
+   | Metric   | (Raw) E5-small | Fine-tuned |
+   |----------|---------------:|-----------:|
+   | Accuracy |          65.2% |      89.0% |
+   | F1 Score |          0.653 |      0.887 |
+   | AUC      |          0.697 |      0.976 |
+ 
+ ## Collaborators
+ 
+ - **Menglin Zhou**
+ - **Jiaping Liu**
+ - **Xiaotian Zhan**
+ 
+ ## Citation
+ 
+ If you use this model, please cite the RAID dataset as follows:
+ 
+ ```
+ @inproceedings{dugan-etal-2024-raid,
+     title = "{RAID}: A Shared Benchmark for Robust Evaluation of Machine-Generated Text Detectors",
+     author = "Dugan, Liam and
+       Hwang, Alyssa and
+       Trhl{\'\i}k, Filip and
+       Zhu, Andrew and
+       Ludan, Josh Magnus and
+       Xu, Hainiu and
+       Ippolito, Daphne and
+       Callison-Burch, Chris",
+     booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+     month = aug,
+     year = "2024",
+     address = "Bangkok, Thailand",
+     publisher = "Association for Computational Linguistics",
+     url = "https://aclanthology.org/2024.acl-long.674",
+     pages = "12463--12492",
+ }
+ ```
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_attn_implementation_autoset": true,
+   "_name_or_path": "MayZhou/e5-small-lora-ai-generated-detector",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 384,
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.46.3",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:688d586cfae7583fa97656330144c99a113a972da8d0df1358e4c2220083c420
+ size 133745403
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff