README.md
```diff
@@ -176,10 +176,10 @@ Optionally, you can use the following command-line flags:
 |`--cai-chat`| Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. |
 |`--cpu`| Use the CPU to generate text.|
 |`--load-in-8bit`| Load the model with 8-bit precision.|
-|`--load-in-4bit`| DEPRECATED: use `--gptq-bits 4` instead. |
-|`--gptq-bits GPTQ_BITS`| GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA and OPT. |
-|`--gptq-model-type MODEL_TYPE`| GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported. |
-|`--gptq-pre-layer GPTQ_PRE_LAYER`| GPTQ: The number of layers to preload. |
+|`--wbits WBITS`| GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. |
+|`--model_type MODEL_TYPE`| GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported. |
+|`--groupsize GROUPSIZE`| GPTQ: Group size. |
+|`--pre_layer PRE_LAYER`| GPTQ: The number of layers to preload. |
 |`--bf16`| Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
 |`--auto-devices`| Automatically split the model across the available GPU(s) and CPU.|
 |`--disk`| If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
```
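With the renamed flags, launching a pre-quantized 4-bit LLaMA model would look something like `python server.py --model llama-7b-4bit --wbits 4 --model_type llama --groupsize 128`. The model name, the group-size value, and the `--model` flag itself are illustrative assumptions here, not taken from this diff.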
```diff
@@ -55,4 +55,5 @@
-    'oasst-*': '<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>'
+    'oasst-*': '<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>',
+    'alpaca-*': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\nWrite a poem about the transformers Python library. \nMention the word \"large language models\" in that poem.\n### Response:\n",
 },
 'lora_prompts': {
     'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:',
```
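Keys such as `'oasst-*'` and `'alpaca-*'` read as shell-style glob patterns to be matched against the model name. A minimal, self-contained sketch of how such a lookup could work (an illustration only; `fnmatch` and the `default_prompt` helper are my assumptions, not the project's actual matching code):

```python
from fnmatch import fnmatch

# Hypothetical prompt table in the shape of the settings above.
prompts = {
    'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:',
    'oasst-*': '<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>',
}

def default_prompt(model_name: str) -> str:
    # First glob key that matches the model name wins; fall back to 'default'.
    for pattern, prompt in prompts.items():
        if pattern != 'default' and fnmatch(model_name, pattern):
            return prompt
    return prompts['default']

print(default_prompt('oasst-sft-1-pythia-12b'))  # -> the OpenAssistant-style prompt
```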
```diff
@@ -78,10 +79,15 @@ def str2bool(v):
 parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.')
 parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
 parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
-parser.add_argument('--load-in-4bit', action='store_true', help='DEPRECATED: use --gptq-bits 4 instead.')
-parser.add_argument('--gptq-bits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.')
-parser.add_argument('--gptq-model-type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported.')
-parser.add_argument('--gptq-pre-layer', type=int, default=0, help='GPTQ: The number of layers to preload.')
+
+parser.add_argument('--gptq-bits', type=int, default=0, help='DEPRECATED: use --wbits instead.')
+parser.add_argument('--gptq-model-type', type=str, help='DEPRECATED: use --model_type instead.')
+parser.add_argument('--gptq-pre-layer', type=int, default=0, help='DEPRECATED: use --pre_layer instead.')
+parser.add_argument('--wbits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
+parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported.')
+parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.')
+parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to preload.')
+
 parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
 parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
 parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
```
```diff
@@ -109,6 +115,8 @@ def str2bool(v):
 args = parser.parse_args()
 
 # Provisional, this will be deleted later
-if args.load_in_4bit:
-    print("Warning: --load-in-4bit is deprecated and will be removed. Use --gptq-bits 4 instead.\n")
```