Skip to content

Commit 1caf296

Browse files
SparkNLP introducing Phi-3 (#14373)
* Added Phi3 with openvino and onnx support * removed changes to phi2 and changed fasttest to slowtest --------- Co-authored-by: Maziyar Panahi <[email protected]>
1 parent 9d94b9a commit 1caf296

File tree

7 files changed

+1352
-0
lines changed

7 files changed

+1352
-0
lines changed

python/sparknlp/annotator/seq2seq/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from sparknlp.annotator.seq2seq.m2m100_transformer import *
2222
from sparknlp.annotator.seq2seq.phi2_transformer import *
2323
from sparknlp.annotator.seq2seq.mistral_transformer import *
24+
from sparknlp.annotator.seq2seq.phi3_transformer import *
2425
from sparknlp.annotator.seq2seq.nllb_transformer import *
2526
from sparknlp.annotator.seq2seq.cpm_transformer import *
2627
from sparknlp.annotator.seq2seq.qwen_transformer import *
Lines changed: 330 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,330 @@
1+
# Copyright 2017-2022 John Snow Labs
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Contains classes for the Phi3Transformer."""
15+
16+
from sparknlp.common import *
17+
18+
19+
class Phi3Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
    """Phi-3

    The Phi-3-Medium-4K-Instruct is a 14B parameters, lightweight, state-of-the-art open model trained with the Phi-3
    datasets that includes both synthetic data and the filtered publicly available websites data with a focus on
    high-quality and reasoning dense properties. The model belongs to the Phi-3 family with the Medium version in two
    variants 4K and 128K which is the context length (in tokens) that it can support.

    The model underwent a post-training process that incorporates both supervised fine-tuning and direct preference
    optimization for the instruction following and safety measures. When assessed against benchmarks testing common
    sense, language understanding, math, code, long context and logical reasoning, Phi-3-Medium-4K-Instruct showcased
    a robust and state-of-the-art performance among models of the same-size and next-size-up.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> phi3 = Phi3Transformer.pretrained() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("generation")


    The default model is ``"phi3"``, if no name is provided. For available
    pretrained models please see the `Models Hub
    <https://sparknlp.org/models?q=phi3>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.
    minOutputLength
        Minimum length of the sequence to be generated, by default 0
    maxOutputLength
        Maximum length of output text, by default 20
    doSample
        Whether or not to use sampling; use greedy decoding otherwise, by default False
    temperature
        The value used to module the next token probabilities, by default 1.0
    topK
        The number of highest probability vocabulary tokens to keep for
        top-k-filtering, by default 50
    topP
        Top cumulative probability for vocabulary tokens, by default 1.0

        If set to float < 1, only the most probable tokens with probabilities
        that add up to ``topP`` or higher are kept for generation.
    repetitionPenalty
        The parameter for repetition penalty, 1.0 means no penalty. , by default
        1.0
    noRepeatNgramSize
        If set to int > 0, all ngrams of that size can only occur once, by
        default 0
    ignoreTokenIds
        A list of token ids which are ignored in the decoder's output, by
        default []

    Notes
    -----
    This is a very computationally expensive module especially on larger
    sequence. The use of an accelerator such as GPU is recommended.

    References
    ----------
    - `Phi-3: Small Language Models with Big Potential
      <https://news.microsoft.com/source/features/ai/the-phi-3-small-language-models-with-big-potential/>`__
    - https://huggingface.co/microsoft/phi-3

    **Paper Abstract:**

    *We introduce phi-3-mini, a 3.8 billion parameter language model trained on 3.3 trillion
    tokens, whose overall performance, as measured by both academic benchmarks and internal
    testing, rivals that of models such as Mixtral 8x7B and GPT-3.5 (e.g., phi-3-mini achieves 69%
    on MMLU and 8.38 on MT-bench), despite being small enough to be deployed on a phone. The
    innovation lies entirely in our dataset for training, a scaled-up version of the one used for
    phi-2, composed of heavily filtered publicly available web data and synthetic data. The model
    is also further aligned for robustness, safety, and chat format. We also provide some initial
    parameter-scaling results with a 7B and 14B models trained for 4.8T tokens, called phi-3-small
    and phi-3-medium, both significantly more capable than phi-3-mini (e.g., respectively 75% and
    78% on MMLU, and 8.7 and 8.9 on MT-bench). Moreover, we also introduce phi-3-vision, a 4.2
    billion parameter model based on phi-3-mini with strong reasoning capabilities for image and
    text prompts.*

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("documents")
    >>> phi3 = Phi3Transformer.pretrained("phi3") \\
    ...     .setInputCols(["documents"]) \\
    ...     .setMaxOutputLength(50) \\
    ...     .setOutputCol("generation")
    >>> pipeline = Pipeline().setStages([documentAssembler, phi3])
    >>> data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("generation.result").show(truncate=False)
    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |result                                                                                                                                                                                              |
    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |[My name is Leonardo . I am a student of the University of California, Berkeley. I am interested in the field of Artificial Intelligence and its applications in the real world. I have a strong    |
    | passion for learning and am always looking for ways to improve my knowledge and skills]                                                                                                            |
    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    """

    name = "Phi3Transformer"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    configProtoBytes = Param(Params._dummy(), "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
                            typeConverter=TypeConverters.toInt)

    maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
                            typeConverter=TypeConverters.toInt)

    doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
                     typeConverter=TypeConverters.toBoolean)

    temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities",
                        typeConverter=TypeConverters.toFloat)

    topK = Param(Params._dummy(), "topK",
                 "The number of highest probability vocabulary tokens to keep for top-k-filtering",
                 typeConverter=TypeConverters.toInt)

    topP = Param(Params._dummy(), "topP",
                 "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
                 typeConverter=TypeConverters.toFloat)

    repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
                              "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
                              typeConverter=TypeConverters.toFloat)

    noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
                              "If set to int > 0, all ngrams of that size can only occur once",
                              typeConverter=TypeConverters.toInt)

    ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
                           "A list of token ids which are ignored in the decoder's output",
                           typeConverter=TypeConverters.toListInt)

    def setIgnoreTokenIds(self, value):
        """A list of token ids which are ignored in the decoder's output.

        Parameters
        ----------
        value : List[int]
            The words to be filtered out
        """
        return self._set(ignoreTokenIds=value)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    def setMinOutputLength(self, value):
        """Sets minimum length of the sequence to be generated.

        Parameters
        ----------
        value : int
            Minimum length of the sequence to be generated
        """
        return self._set(minOutputLength=value)

    def setMaxOutputLength(self, value):
        """Sets maximum length of output text.

        Parameters
        ----------
        value : int
            Maximum length of output text
        """
        return self._set(maxOutputLength=value)

    def setDoSample(self, value):
        """Sets whether or not to use sampling, use greedy decoding otherwise.

        Parameters
        ----------
        value : bool
            Whether or not to use sampling; use greedy decoding otherwise
        """
        return self._set(doSample=value)

    def setTemperature(self, value):
        """Sets the value used to module the next token probabilities.

        Parameters
        ----------
        value : float
            The value used to module the next token probabilities
        """
        return self._set(temperature=value)

    def setTopK(self, value):
        """Sets the number of highest probability vocabulary tokens to keep for
        top-k-filtering.

        Parameters
        ----------
        value : int
            Number of highest probability vocabulary tokens to keep
        """
        return self._set(topK=value)

    def setTopP(self, value):
        """Sets the top cumulative probability for vocabulary tokens.

        If set to float < 1, only the most probable tokens with probabilities
        that add up to ``topP`` or higher are kept for generation.

        Parameters
        ----------
        value : float
            Cumulative probability for vocabulary tokens
        """
        return self._set(topP=value)

    def setRepetitionPenalty(self, value):
        """Sets the parameter for repetition penalty. 1.0 means no penalty.

        Parameters
        ----------
        value : float
            The repetition penalty

        References
        ----------
        See `Ctrl: A Conditional Transformer Language Model For Controllable
        Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
        """
        return self._set(repetitionPenalty=value)

    def setNoRepeatNgramSize(self, value):
        """Sets size of n-grams that can only occur once.

        If set to int > 0, all ngrams of that size can only occur once.

        Parameters
        ----------
        value : int
            N-gram size can only occur once
        """
        return self._set(noRepeatNgramSize=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.Phi3Transformer", java_model=None):
        super(Phi3Transformer, self).__init__(classname=classname, java_model=java_model)
        # NOTE(review): topK default was 500 in the original, contradicting the
        # class docstring ("by default 50"); 50 matches the documented contract.
        self._setDefault(minOutputLength=0, maxOutputLength=20, doSample=False, temperature=1.0, topK=50, topP=1.0,
                         repetitionPenalty=1.0, noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1)

    @staticmethod
    def loadSavedModel(folder, spark_session, use_openvino=False):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession
        use_openvino : bool, optional
            Whether to load the model with the OpenVINO backend instead of
            ONNX, by default False

        Returns
        -------
        Phi3Transformer
            The restored model
        """
        from sparknlp.internal import _Phi3Loader
        jModel = _Phi3Loader(folder, spark_session._jsparkSession, use_openvino)._java_obj
        return Phi3Transformer(java_model=jModel)

    @staticmethod
    def pretrained(name="phi3", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "phi3"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        Phi3Transformer
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(Phi3Transformer, name, lang, remote_loc)

python/sparknlp/internal/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,14 @@ def __init__(self, path, jspark, use_openvino=False):
339339
use_openvino,
340340
)
341341

342+
class _Phi3Loader(ExtendedJavaWrapper):
    """JVM bridge that invokes ``Phi3Transformer.loadSavedModel`` on the Scala side.

    Parameters mirror the Scala loader: the model folder, the Java Spark
    session handle, and whether to use the OpenVINO backend.
    """

    def __init__(self, path, jspark, use_openvino=False):
        loader_fqn = "com.johnsnowlabs.nlp.annotators.seq2seq.Phi3Transformer.loadSavedModel"
        super(_Phi3Loader, self).__init__(loader_fqn, path, jspark, use_openvino)
342350

343351
class _RoBertaLoader(ExtendedJavaWrapper):
344352
def __init__(self, path, jspark, use_openvino=False):

0 commit comments

Comments
 (0)