2 files changed: +2 -2 lines changed
Original file line number Diff line number Diff line change
436
436
" \" n_layers\" : 28, # Number of layers\n " ,
437
437
" \" hidden_dim\" : 3072, # Size of the intermediate dimension in FeedForward\n " ,
438
438
" \" head_dim\" : 128, # Size of the heads in GQA\n " ,
439
- " \" qk_norm\" : True, # Whether to normalize queries and values in GQA\n " ,
439
+ " \" qk_norm\" : True, # Whether to normalize queries and keys in GQA\n " ,
440
440
" \" n_kv_groups\" : 8, # Key-Value groups for grouped-query attention\n " ,
441
441
" \" rope_base\" : 1_000_000.0, # The base in RoPE's \" theta\"\n " ,
442
442
" \" dtype\" : torch.bfloat16, # Lower-precision dtype to reduce memory usage\n " ,
Original file line number Diff line number Diff line change
22
22
"n_layers" : 28 , # Number of layers
23
23
"hidden_dim" : 3072 , # Size of the intermediate dimension in FeedForward
24
24
"head_dim" : 128 , # Size of the heads in GQA
25
- "qk_norm" : True , # Whether to normalize queries and values in GQA
25
+ "qk_norm" : True , # Whether to normalize queries and keys in GQA
26
26
"n_kv_groups" : 8 , # Key-Value groups for grouped-query attention
27
27
"rope_base" : 1_000_000.0 , # The base in RoPE's "theta"
28
28
"dtype" : torch .bfloat16 , # Lower-precision dtype to reduce memory usage
You can’t perform that action at this time.
0 commit comments