@@ -19,67 +19,68 @@ const (
19
19
// TTSConfig groups the text-to-speech settings of a model configuration.
// Both fields use the same key name in YAML and in JSON.
type TTSConfig struct {
	// Voice wav path or id
	Voice string `yaml:"voice" json:"voice"`

	// AudioPath is read from the "audio_path" key.
	// NOTE(review): how this path is consumed is not visible here — confirm with the TTS backend.
	AudioPath string `yaml:"audio_path" json:"audio_path"`
}
26
26
27
+ // ModelConfig represents a model configuration
27
28
type ModelConfig struct {
28
- schema.PredictionOptions `yaml:"parameters"`
29
- Name string `yaml:"name"`
30
-
31
- F16 * bool `yaml:"f16"`
32
- Threads * int `yaml:"threads"`
33
- Debug * bool `yaml:"debug"`
34
- Roles map [string ]string `yaml:"roles"`
35
- Embeddings * bool `yaml:"embeddings"`
36
- Backend string `yaml:"backend"`
37
- TemplateConfig TemplateConfig `yaml:"template"`
38
- KnownUsecaseStrings []string `yaml:"known_usecases"`
39
- KnownUsecases * ModelConfigUsecases `yaml:"-"`
40
- Pipeline Pipeline `yaml:"pipeline"`
41
-
42
- PromptStrings , InputStrings []string `yaml:"-"`
43
- InputToken [][]int `yaml:"-"`
44
- functionCallString , functionCallNameString string `yaml:"-"`
45
- ResponseFormat string `yaml:"-"`
46
- ResponseFormatMap map [string ]interface {} `yaml:"-"`
47
-
48
- FunctionsConfig functions.FunctionsConfig `yaml:"function"`
49
-
50
- FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
29
+ schema.PredictionOptions `yaml:"parameters" json:"parameters" `
30
+ Name string `yaml:"name" json:"name" `
31
+
32
+ F16 * bool `yaml:"f16" json:"f16" `
33
+ Threads * int `yaml:"threads" json:"threads" `
34
+ Debug * bool `yaml:"debug" json:"debug" `
35
+ Roles map [string ]string `yaml:"roles" json:"roles" `
36
+ Embeddings * bool `yaml:"embeddings" json:"embeddings" `
37
+ Backend string `yaml:"backend" json:"backend" `
38
+ TemplateConfig TemplateConfig `yaml:"template" json:"template" `
39
+ KnownUsecaseStrings []string `yaml:"known_usecases" json:"known_usecases" `
40
+ KnownUsecases * ModelConfigUsecases `yaml:"-" json:"-" `
41
+ Pipeline Pipeline `yaml:"pipeline" json:"pipeline" `
42
+
43
+ PromptStrings , InputStrings []string `yaml:"-" json:"-" `
44
+ InputToken [][]int `yaml:"-" json:"-" `
45
+ functionCallString , functionCallNameString string `yaml:"-" json:"-" `
46
+ ResponseFormat string `yaml:"-" json:"-" `
47
+ ResponseFormatMap map [string ]interface {} `yaml:"-" json:"-" `
48
+
49
+ FunctionsConfig functions.FunctionsConfig `yaml:"function" json:"function" `
50
+
51
+ FeatureFlag FeatureFlag `yaml:"feature_flags" json:"feature_flags" ` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
51
52
// LLM configs (GPT4ALL, Llama.cpp, ...)
52
- LLMConfig `yaml:",inline"`
53
+ LLMConfig `yaml:",inline" json:",inline" `
53
54
54
55
// Diffusers
55
- Diffusers Diffusers `yaml:"diffusers"`
56
- Step int `yaml:"step"`
56
+ Diffusers Diffusers `yaml:"diffusers" json:"diffusers" `
57
+ Step int `yaml:"step" json:"step" `
57
58
58
59
// GRPC Options
59
- GRPC GRPC `yaml:"grpc"`
60
+ GRPC GRPC `yaml:"grpc" json:"grpc" `
60
61
61
62
// TTS specifics
62
- TTSConfig `yaml:"tts"`
63
+ TTSConfig `yaml:"tts" json:"tts" `
63
64
64
65
// CUDA
65
66
// Explicitly enable CUDA or not (some backends might need it)
66
- CUDA bool `yaml:"cuda"`
67
+ CUDA bool `yaml:"cuda" json:"cuda" `
67
68
68
- DownloadFiles []File `yaml:"download_files"`
69
+ DownloadFiles []File `yaml:"download_files" json:"download_files" `
69
70
70
- Description string `yaml:"description"`
71
- Usage string `yaml:"usage"`
71
+ Description string `yaml:"description" json:"description" `
72
+ Usage string `yaml:"usage" json:"usage" `
72
73
73
- Options []string `yaml:"options"`
74
- Overrides []string `yaml:"overrides"`
74
+ Options []string `yaml:"options" json:"options" `
75
+ Overrides []string `yaml:"overrides" json:"overrides" `
75
76
}
76
77
77
78
// Pipeline defines other models to use for audio-to-audio.
// Each field is a plain string read from the key of the same (lower-case) name.
type Pipeline struct {
	TTS           string `yaml:"tts" json:"tts"`
	LLM           string `yaml:"llm" json:"llm"`
	Transcription string `yaml:"transcription" json:"transcription"`
	VAD           string `yaml:"vad" json:"vad"`
}
84
85
85
86
type File struct {
@@ -91,130 +92,132 @@ type File struct {
91
92
// FeatureFlag maps a flag name to an optional boolean value.
type FeatureFlag map[string]*bool

// Enabled reports whether the flag s is set to true.
//
// It returns false when s is not present in the map, when its value is a nil
// pointer, or when the pointed-to value is false. Calling it on a nil
// FeatureFlag is safe and returns false.
func (ff FeatureFlag) Enabled(s string) bool {
	if v, exists := ff[s]; exists && v != nil {
		return *v
	}
	return false
}
97
100
98
101
// GRPC holds the gRPC options of a model configuration.
type GRPC struct {
	// Attempts is read from the "attempts" key.
	Attempts int `yaml:"attempts" json:"attempts"`
	// AttemptsSleepTime is read from the "attempts_sleep_time" key.
	// NOTE(review): the time unit is not visible here — confirm at the call site.
	AttemptsSleepTime int `yaml:"attempts_sleep_time" json:"attempts_sleep_time"`
}
102
105
103
106
// Diffusers holds the options stored under the "diffusers" key of a model
// configuration. Every field uses the same key name in YAML and in JSON.
type Diffusers struct {
	CUDA             bool   `yaml:"cuda" json:"cuda"`
	PipelineType     string `yaml:"pipeline_type" json:"pipeline_type"`
	SchedulerType    string `yaml:"scheduler_type" json:"scheduler_type"`
	EnableParameters string `yaml:"enable_parameters" json:"enable_parameters"` // A list of comma separated parameters to specify
	IMG2IMG          bool   `yaml:"img2img" json:"img2img"`                     // Image to Image Diffuser
	ClipSkip         int    `yaml:"clip_skip" json:"clip_skip"`                 // Skip every N frames
	ClipModel        string `yaml:"clip_model" json:"clip_model"`               // Clip model to use
	ClipSubFolder    string `yaml:"clip_subfolder" json:"clip_subfolder"`       // Subfolder to use for clip model
	ControlNet       string `yaml:"control_net" json:"control_net"`
}
114
117
115
118
// LLMConfig is a struct that holds the configuration that are
// generic for most of the LLM backends.
//
// Pointer-typed fields can be nil, which lets callers tell an unset option
// apart from an explicit zero value.
type LLMConfig struct {
	SystemPrompt    string   `yaml:"system_prompt" json:"system_prompt"`
	TensorSplit     string   `yaml:"tensor_split" json:"tensor_split"`
	MainGPU         string   `yaml:"main_gpu" json:"main_gpu"`
	RMSNormEps      float32  `yaml:"rms_norm_eps" json:"rms_norm_eps"`
	NGQA            int32    `yaml:"ngqa" json:"ngqa"`
	PromptCachePath string   `yaml:"prompt_cache_path" json:"prompt_cache_path"`
	PromptCacheAll  bool     `yaml:"prompt_cache_all" json:"prompt_cache_all"`
	PromptCacheRO   bool     `yaml:"prompt_cache_ro" json:"prompt_cache_ro"`
	MirostatETA     *float64 `yaml:"mirostat_eta" json:"mirostat_eta"`
	MirostatTAU     *float64 `yaml:"mirostat_tau" json:"mirostat_tau"`
	Mirostat        *int     `yaml:"mirostat" json:"mirostat"`
	NGPULayers      *int     `yaml:"gpu_layers" json:"gpu_layers"`
	MMap            *bool    `yaml:"mmap" json:"mmap"`
	MMlock          *bool    `yaml:"mmlock" json:"mmlock"`
	LowVRAM         *bool    `yaml:"low_vram" json:"low_vram"`
	Reranking       *bool    `yaml:"reranking" json:"reranking"`
	Grammar         string   `yaml:"grammar" json:"grammar"`
	StopWords       []string `yaml:"stopwords" json:"stopwords"`
	Cutstrings      []string `yaml:"cutstrings" json:"cutstrings"`
	ExtractRegex    []string `yaml:"extract_regex" json:"extract_regex"`
	TrimSpace       []string `yaml:"trimspace" json:"trimspace"`
	TrimSuffix      []string `yaml:"trimsuffix" json:"trimsuffix"`

	ContextSize          *int             `yaml:"context_size" json:"context_size"`
	NUMA                 bool             `yaml:"numa" json:"numa"`
	LoraAdapter          string           `yaml:"lora_adapter" json:"lora_adapter"`
	LoraBase             string           `yaml:"lora_base" json:"lora_base"`
	LoraAdapters         []string         `yaml:"lora_adapters" json:"lora_adapters"`
	LoraScales           []float32        `yaml:"lora_scales" json:"lora_scales"`
	LoraScale            float32          `yaml:"lora_scale" json:"lora_scale"`
	NoMulMatQ            bool             `yaml:"no_mulmatq" json:"no_mulmatq"`
	DraftModel           string           `yaml:"draft_model" json:"draft_model"`
	NDraft               int32            `yaml:"n_draft" json:"n_draft"`
	Quantization         string           `yaml:"quantization" json:"quantization"`
	LoadFormat           string           `yaml:"load_format" json:"load_format"`
	GPUMemoryUtilization float32          `yaml:"gpu_memory_utilization" json:"gpu_memory_utilization"` // vLLM
	TrustRemoteCode      bool             `yaml:"trust_remote_code" json:"trust_remote_code"`           // vLLM
	EnforceEager         bool             `yaml:"enforce_eager" json:"enforce_eager"`                   // vLLM
	SwapSpace            int              `yaml:"swap_space" json:"swap_space"`                         // vLLM
	MaxModelLen          int              `yaml:"max_model_len" json:"max_model_len"`                   // vLLM
	TensorParallelSize   int              `yaml:"tensor_parallel_size" json:"tensor_parallel_size"`     // vLLM
	DisableLogStatus     bool             `yaml:"disable_log_stats" json:"disable_log_stats"`           // vLLM
	DType                string           `yaml:"dtype" json:"dtype"`                                   // vLLM
	LimitMMPerPrompt     LimitMMPerPrompt `yaml:"limit_mm_per_prompt" json:"limit_mm_per_prompt"`       // vLLM
	MMProj               string           `yaml:"mmproj" json:"mmproj"`

	FlashAttention bool   `yaml:"flash_attention" json:"flash_attention"`
	NoKVOffloading bool   `yaml:"no_kv_offloading" json:"no_kv_offloading"`
	CacheTypeK     string `yaml:"cache_type_k" json:"cache_type_k"`
	CacheTypeV     string `yaml:"cache_type_v" json:"cache_type_v"`

	RopeScaling string `yaml:"rope_scaling" json:"rope_scaling"`
	ModelType   string `yaml:"type" json:"type"`

	YarnExtFactor  float32 `yaml:"yarn_ext_factor" json:"yarn_ext_factor"`
	YarnAttnFactor float32 `yaml:"yarn_attn_factor" json:"yarn_attn_factor"`
	YarnBetaFast   float32 `yaml:"yarn_beta_fast" json:"yarn_beta_fast"`
	YarnBetaSlow   float32 `yaml:"yarn_beta_slow" json:"yarn_beta_slow"`

	CFGScale float32 `yaml:"cfg_scale" json:"cfg_scale"` // Classifier-Free Guidance Scale
}

// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
type LimitMMPerPrompt struct {
	LimitImagePerPrompt int `yaml:"image" json:"image"`
	LimitVideoPerPrompt int `yaml:"video" json:"video"`
	LimitAudioPerPrompt int `yaml:"audio" json:"audio"`
}
186
189
187
190
// TemplateConfig is a struct that holds the configuration of the templating system
type TemplateConfig struct {
	// Chat is the template used in the chat completion endpoint
	Chat string `yaml:"chat" json:"chat"`

	// ChatMessage is the template used for chat messages
	ChatMessage string `yaml:"chat_message" json:"chat_message"`

	// Completion is the template used for completion requests
	Completion string `yaml:"completion" json:"completion"`

	// Edit is the template used for edit completion requests
	Edit string `yaml:"edit" json:"edit"`

	// Functions is the template used when tools are present in the client requests
	Functions string `yaml:"function" json:"function"`

	// UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used.
	// Note: this is mostly consumed for backends such as vllm and transformers
	// that can use the tokenizers specified in the JSON config files of the models
	UseTokenizerTemplate bool `yaml:"use_tokenizer_template" json:"use_tokenizer_template"`

	// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
	// It defaults to \n
	JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character" json:"join_chat_messages_by_character"`

	Multimodal string `yaml:"multimodal" json:"multimodal"`

	JinjaTemplate bool `yaml:"jinja_template" json:"jinja_template"`

	ReplyPrefix string `yaml:"reply_prefix" json:"reply_prefix"`
}
219
222
220
223
func (c * ModelConfig ) UnmarshalYAML (value * yaml.Node ) error {
0 commit comments