Commit d904a7e

support ShareGPT dataset as data file
Signed-off-by: guangli.bao <[email protected]>
1 parent ad9513f commit d904a7e

5 files changed: +159 -0 lines changed

contrib/sharegpt_preprocess/README.md

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# ShareGPT Datasets

You can use ShareGPT_V3_unfiltered_cleaned_split.json as a benchmark dataset.

## Example Commands

Download and prepare the ShareGPT dataset. You can specify the proportion of data to process by providing a number between 0 and 1 as an argument to the script.

```bash
cd contrib/sharegpt_preprocess
pip install -r requirements.txt
bash prepare_sharegpt_data.sh 1
```

In this example, 1 indicates that 100% of the dataset is processed; adjust this value as needed. A Conda environment is recommended for installing the dependencies.

```bash
guidellm benchmark \
  --target "http://localhost:8000" \
  --rate-type "throughput" \
  --data-args '{"prompt_column": "value", "split": "train"}' \
  --max-requests 10 \
  --data "/${local_path}/ShareGPT.json"
```
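
The preprocessed file passed to `--data` is a flat JSON list of filtered human prompts; each entry carries the `from`, `text`, `char_count`, and `word_count` fields written by the preprocessing script. A minimal sketch for inspecting the output before benchmarking, assuming `ShareGPT.json` sits in the current directory:

```python
# Quick look at the preprocessed prompts (illustrative sketch, not part of the pipeline).
import json
from pathlib import Path

with Path("ShareGPT.json").open(encoding="utf-8") as f:
    prompts = json.load(f)

print(f"{len(prompts)} prompts after filtering")
print(prompts[0])  # {"from": "human", "text": "...", "char_count": ..., "word_count": ...}
```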

contrib/sharegpt_preprocess/__init__.py

Whitespace-only changes.
contrib/sharegpt_preprocess/prepare_sharegpt_data.sh

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
#!/bin/bash

wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 preprocessing_sharegpt_data.py --parse $1
contrib/sharegpt_preprocess/preprocessing_sharegpt_data.py

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
import argparse
import json
import os
import re
from pathlib import Path
from typing import Optional

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, PreTrainedTokenizerBase

MIN_CHAR = 10
MAX_CHAR = 1000


class TokenCounter:
    def __init__(self, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2"):
        self.model_name = model_name
        self._tokenizer: Optional[PreTrainedTokenizerBase] = None

    def _initialize_tokenizer(self) -> None:
        if self._tokenizer is None:
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
            try:
                self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            except (OSError, ImportError, ValueError) as e:
                raise RuntimeError(f"Failed to initialize tokenizer: {e}") from e

    def estimate_num_tokens(self, text: str) -> int:
        self._initialize_tokenizer()

        if self._tokenizer is None:
            return 0

        try:
            encoding = self._tokenizer(text, return_tensors=None)
            return len(encoding["input_ids"])
        except (AttributeError, TypeError, RuntimeError) as e:
            raise ValueError(f"Error processing text: {e}") from e


def extract_and_save_with_filtering(file):
    """Extract human prompts and apply filtering conditions."""
    dataset = load_dataset("json", data_files=file, split="train")
    filtered_prompts = []

    for example in dataset:
        conversations = example.get("conversations", [])
        if isinstance(conversations, list):
            for turn in conversations:
                if turn.get("from") in ["human", "user"]:
                    prompt_text = turn["value"].strip()
                    # Keep prompts that are 10-1000 characters long, are not URLs,
                    # contain no special characters, and are not pure numbers.
                    if (
                        MIN_CHAR <= len(prompt_text) <= MAX_CHAR
                        and not prompt_text.startswith(("http://", "https://"))
                        and not re.search(r"[<>{}[\]\\]", prompt_text)
                        and not prompt_text.isdigit()
                    ):
                        filtered_prompts.append(
                            {
                                "from": turn.get("from"),
                                "text": prompt_text,
                                "char_count": len(prompt_text),
                                "word_count": len(prompt_text.split()),
                            }
                        )

    return filtered_prompts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process data percentage.")
    parser.add_argument(
        "--parse",
        type=float,
        default=1,
        help="The proportion of data to process (0 to 1). Default is 1 (100%%).",
    )
    args = parser.parse_args()

    sharegpt_file = "ShareGPT_V3_unfiltered_cleaned_split.json"
    with Path(sharegpt_file).open("r", encoding="utf-8") as file:
        data = json.load(file)

    counter = TokenCounter()
    num_of_ids = len(data)
    data = data[: int(num_of_ids * args.parse)]
    for d in data:
        d["num_round"] = len(d["conversations"])
        human_tokens = []
        gpt_tokens = []
        for conv in d["conversations"]:
            if conv["from"] == "human":
                human_tokens.append(counter.estimate_num_tokens(conv["value"]))
            if conv["from"] == "gpt":
                token_number = counter.estimate_num_tokens(conv["value"])
                conv["num_tokens"] = token_number
                gpt_tokens.append(token_number)
        if len(human_tokens) == 0:
            d["average_human_token"] = 0
            d["max_human_token"] = 0
        else:
            d["average_human_token"] = float(np.mean(human_tokens))
            d["max_human_token"] = float(np.max(human_tokens))
        if len(gpt_tokens) == 0:
            d["average_gpt_token"] = 0
            d["max_gpt_token"] = 0
        else:
            d["average_gpt_token"] = float(np.mean(gpt_tokens))
            d["max_gpt_token"] = float(np.max(gpt_tokens))

    # Save the unfiltered, token-annotated dataset to ShareGPT.json.
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
    # Filter to human prompts and overwrite ShareGPT.json with the result.
    filtered_result = extract_and_save_with_filtering("ShareGPT.json")
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(filtered_result, file, ensure_ascii=False, indent=2)
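
The two helpers above can also be exercised on their own. A minimal sketch, assuming the module is importable as `preprocessing_sharegpt_data` from the working directory and the tokenizer model can be fetched from the Hugging Face Hub:

```python
# Illustrative standalone use of the helpers defined above (not invoked by the pipeline).
from preprocessing_sharegpt_data import TokenCounter, extract_and_save_with_filtering

# Token counting with the default Mistral tokenizer (loaded lazily on first use).
counter = TokenCounter()
print(counter.estimate_num_tokens("How many GPUs do I need to serve a 7B model?"))

# Re-run only the prompt-filtering step directly on the raw ShareGPT dump.
prompts = extract_and_save_with_filtering("ShareGPT_V3_unfiltered_cleaned_split.json")
print(len(prompts), "prompts passed the length/URL/character filters")
```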
contrib/sharegpt_preprocess/requirements.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
tqdm
pandas
openai
pyyaml
