# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the Phi3Transformer."""

from sparknlp.common import *


class Phi3Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
    """Phi-3

    Phi-3-Medium-4K-Instruct is a lightweight, state-of-the-art open model with 14 billion parameters, trained on the
    Phi-3 datasets, which include both synthetic data and filtered, publicly available website data, with a focus on
    high-quality, reasoning-dense properties. The model belongs to the Phi-3 family; the Medium version comes in two
    variants, 4K and 128K, which denote the context length (in tokens) it can support.

    The model underwent a post-training process that incorporates both supervised fine-tuning and direct preference
    optimization for instruction following and safety. When assessed against benchmarks testing common sense,
    language understanding, math, code, long context, and logical reasoning, Phi-3-Medium-4K-Instruct showed robust,
    state-of-the-art performance among models of the same size and the next size up.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> phi3 = Phi3Transformer.pretrained() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("generation")


    The default model is ``"phi3"``, if no name is provided. For available
    pretrained models please see the `Models Hub
    <https://sparknlp.org/models?q=phi3>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.
    minOutputLength
        Minimum length of the sequence to be generated, by default 0
    maxOutputLength
        Maximum length of output text, by default 20
    doSample
        Whether or not to use sampling; use greedy decoding otherwise, by default False
    temperature
        The value used to modulate the next token probabilities, by default 1.0
    topK
        The number of highest probability vocabulary tokens to keep for
        top-k-filtering, by default 50
    topP
        Top cumulative probability for vocabulary tokens, by default 1.0

        If set to float < 1, only the most probable tokens with probabilities
        that add up to ``topP`` or higher are kept for generation.
    repetitionPenalty
        The parameter for repetition penalty, 1.0 means no penalty, by default
        1.0
    noRepeatNgramSize
        If set to int > 0, all n-grams of that size can only occur once, by
        default 0
    ignoreTokenIds
        A list of token ids which are ignored in the decoder's output, by
        default []

    Notes
    -----
    This annotator is very computationally expensive, especially on longer
    sequences. The use of an accelerator such as a GPU is recommended.

    References
    ----------
    - `Phi-3: Small Language Models with Big Potential
      <https://news.microsoft.com/source/features/ai/the-phi-3-small-language-models-with-big-potential/>`__
    - https://huggingface.co/microsoft/phi-3

    **Paper Abstract:**

    *We introduce phi-3-mini, a 3.8 billion parameter language model trained on 3.3 trillion
    tokens, whose overall performance, as measured by both academic benchmarks and internal
    testing, rivals that of models such as Mixtral 8x7B and GPT-3.5 (e.g., phi-3-mini achieves 69%
    on MMLU and 8.38 on MT-bench), despite being small enough to be deployed on a phone. The
    innovation lies entirely in our dataset for training, a scaled-up version of the one used for
    phi-2, composed of heavily filtered publicly available web data and synthetic data. The model
    is also further aligned for robustness, safety, and chat format. We also provide some initial
    parameter-scaling results with a 7B and 14B models trained for 4.8T tokens, called phi-3-small
    and phi-3-medium, both significantly more capable than phi-3-mini (e.g., respectively 75% and
    78% on MMLU, and 8.7 and 8.9 on MT-bench). Moreover, we also introduce phi-3-vision, a 4.2
    billion parameter model based on phi-3-mini with strong reasoning capabilities for image and
    text prompts.*

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("documents")
    >>> phi3 = Phi3Transformer.pretrained("phi3") \\
    ...     .setInputCols(["documents"]) \\
    ...     .setMaxOutputLength(50) \\
    ...     .setOutputCol("generation")
    >>> pipeline = Pipeline().setStages([documentAssembler, phi3])
    >>> data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("generation.result").show(truncate=False)
    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |result                                                                                                                                                                                              |
    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |[My name is Leonardo . I am a student of the University of California, Berkeley. I am interested in the field of Artificial Intelligence and its applications in the real world. I have a strong   |
    | passion for learning and am always looking for ways to improve my knowledge and skills]                                                                                                           |
    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
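
    To generate with sampling instead of greedy decoding, ``doSample`` can be
    enabled and the sampling parameters tuned. A minimal sketch; the parameter
    values below are illustrative, not recommended defaults:

    >>> phi3Sampling = Phi3Transformer.pretrained("phi3") \\
    ...     .setInputCols(["documents"]) \\
    ...     .setOutputCol("generation") \\
    ...     .setDoSample(True) \\
    ...     .setTemperature(0.7) \\
    ...     .setTopK(50) \\
    ...     .setTopP(0.9) \\
    ...     .setRepetitionPenalty(1.1)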
    """

    name = "Phi3Transformer"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    configProtoBytes = Param(Params._dummy(), "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
                            typeConverter=TypeConverters.toInt)

    maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
                            typeConverter=TypeConverters.toInt)

    doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
                     typeConverter=TypeConverters.toBoolean)

    temperature = Param(Params._dummy(), "temperature", "The value used to modulate the next token probabilities",
                        typeConverter=TypeConverters.toFloat)

    topK = Param(Params._dummy(), "topK",
                 "The number of highest probability vocabulary tokens to keep for top-k-filtering",
                 typeConverter=TypeConverters.toInt)

    topP = Param(Params._dummy(), "topP",
                 "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
                 typeConverter=TypeConverters.toFloat)

    repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
                              "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
                              typeConverter=TypeConverters.toFloat)

    noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
                              "If set to int > 0, all ngrams of that size can only occur once",
                              typeConverter=TypeConverters.toInt)

    ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
                           "A list of token ids which are ignored in the decoder's output",
                           typeConverter=TypeConverters.toListInt)

    def setIgnoreTokenIds(self, value):
        """A list of token ids which are ignored in the decoder's output.

        Parameters
        ----------
        value : List[int]
            The token ids to be filtered out
        """
        return self._set(ignoreTokenIds=value)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    def setMinOutputLength(self, value):
        """Sets minimum length of the sequence to be generated.

        Parameters
        ----------
        value : int
            Minimum length of the sequence to be generated
        """
        return self._set(minOutputLength=value)

    def setMaxOutputLength(self, value):
        """Sets maximum length of output text.

        Parameters
        ----------
        value : int
            Maximum length of output text
        """
        return self._set(maxOutputLength=value)

    def setDoSample(self, value):
        """Sets whether or not to use sampling, use greedy decoding otherwise.

        Parameters
        ----------
        value : bool
            Whether or not to use sampling; use greedy decoding otherwise
        """
        return self._set(doSample=value)

    def setTemperature(self, value):
        """Sets the value used to modulate the next token probabilities.

        Parameters
        ----------
        value : float
            The value used to modulate the next token probabilities
        """
        return self._set(temperature=value)

    def setTopK(self, value):
        """Sets the number of highest probability vocabulary tokens to keep for
        top-k-filtering.

        Parameters
        ----------
        value : int
            Number of highest probability vocabulary tokens to keep
        """
        return self._set(topK=value)

    def setTopP(self, value):
        """Sets the top cumulative probability for vocabulary tokens.

        If set to float < 1, only the most probable tokens with probabilities
        that add up to ``topP`` or higher are kept for generation.

        Parameters
        ----------
        value : float
            Cumulative probability for vocabulary tokens
        """
        return self._set(topP=value)

    def setRepetitionPenalty(self, value):
        """Sets the parameter for repetition penalty. 1.0 means no penalty.

        Parameters
        ----------
        value : float
            The repetition penalty

        References
        ----------
        See `Ctrl: A Conditional Transformer Language Model For Controllable
        Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
        """
        return self._set(repetitionPenalty=value)

    def setNoRepeatNgramSize(self, value):
        """Sets size of n-grams that can only occur once.

        If set to int > 0, all n-grams of that size can only occur once.

        Parameters
        ----------
        value : int
            Size of n-grams that can only occur once
        """
        return self._set(noRepeatNgramSize=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.Phi3Transformer", java_model=None):
        super(Phi3Transformer, self).__init__(classname=classname, java_model=java_model)
        self._setDefault(minOutputLength=0, maxOutputLength=20, doSample=False, temperature=1.0, topK=50, topP=1.0,
                         repetitionPenalty=1.0, noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1)

    @staticmethod
    def loadSavedModel(folder, spark_session, use_openvino=False):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession
        use_openvino : bool, optional
            Whether to use OpenVINO for inference, by default False

        Returns
        -------
        Phi3Transformer
            The restored model
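
        Examples
        --------
        A minimal sketch of loading an exported model; the folder path is
        illustrative and must point at a model exported for Spark NLP:

        >>> phi3 = Phi3Transformer.loadSavedModel("/path/to/exported/phi3", spark) \\
        ...     .setInputCols(["document"]) \\
        ...     .setOutputCol("generation")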
        """
        from sparknlp.internal import _Phi3Loader
        jModel = _Phi3Loader(folder, spark_session._jsparkSession, use_openvino)._java_obj
        return Phi3Transformer(java_model=jModel)

    @staticmethod
    def pretrained(name="phi3", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "phi3"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        Phi3Transformer
            The restored model
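
        Examples
        --------
        A quick sketch, using the default English model name:

        >>> phi3 = Phi3Transformer.pretrained("phi3", lang="en") \\
        ...     .setInputCols(["document"]) \\
        ...     .setOutputCol("generation")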
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(Phi3Transformer, name, lang, remote_loc)