# data_utils.py
import json

from datasets import load_dataset, DatasetDict


def transform_conala(output_dir="datasets"):
    # CoNaLa: natural-language intents ("nl") paired with code snippets ("cmd").
    dataset = load_dataset("neulab/docprompting-conala")

    def process_example(e):
        # Store each pair as a two-turn chat conversation.
        messages = [
            {
                "role": "user",
                "content": e["nl"]
            },
            {
                "role": "assistant",
                "content": e["cmd"]
            }
        ]
        return {"messages": messages}

    dataset = dataset.map(process_example, num_proc=8)
    dataset.save_to_disk(f"{output_dir}/conala")
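

# Sketch, not part of the original script: render one saved CoNaLa example with a
# tokenizer chat template to see the final training text. The model name is left as
# a parameter because any instruction-tuned checkpoint whose tokenizer ships a chat
# template would work; none is assumed here.
def preview_conala_example(model_name, path="datasets/conala"):
    from datasets import load_from_disk
    from transformers import AutoTokenizer

    dataset = load_from_disk(path)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # "messages" holds the [user, assistant] pair produced by process_example above.
    return tokenizer.apply_chat_template(dataset["train"][0]["messages"], tokenize=False)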


def transform_code_alpaca(output_dir="datasets"):
    # CodeAlpaca: instruction ("prompt") / solution ("completion") pairs.
    dataset = load_dataset("HuggingFaceH4/CodeAlpaca_20K")

    def process_example(e):
        messages = [
            {
                "role": "user",
                "content": e["prompt"]
            },
            {
                "role": "assistant",
                "content": e["completion"]
            }
        ]
        return {"messages": messages}

    # Create a validation set by carving 500 examples out of the shuffled train split.
    train_set = dataset["train"].shuffle(seed=42)
    validation_set = train_set.select(range(500))
    train_set = train_set.select(range(500, len(train_set)))
    dataset = DatasetDict({
        "train": train_set,
        "validation": validation_set,
        "test": dataset["test"]
    })

    dataset = dataset.map(process_example, num_proc=8)
    dataset.save_to_disk(f"{output_dir}/codealpaca")
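

# Sketch, not part of the original script: quick sanity check that the manual
# 500-example validation split above ends up on disk with the expected sizes.
def check_codealpaca_splits(path="datasets/codealpaca"):
    from datasets import load_from_disk

    dataset = load_from_disk(path)
    assert len(dataset["validation"]) == 500
    # "train" should be the original train split minus the 500 validation examples.
    print({split: len(dataset[split]) for split in dataset})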


def transform_apps(output_dir="datasets"):
    # This preprocessing follows the same format used in the original APPS paper:
    # https://github.com/hendrycks/apps/blob/main/train/dataset_apps/APPSBaseDataset.py
    # https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/example_script.py
    dataset = load_dataset("codeparrot/apps", trust_remote_code=True)

    def process_example(e):
        starter_code = None if len(e["starter_code"]) == 0 else e["starter_code"]
        try:
            input_output = json.loads(e["input_output"])
            fn_name = None if not input_output.get("fn_name") else input_output["fn_name"]
        except ValueError:
            fn_name = None
        try:
            solutions = json.loads(e["solutions"])
        except ValueError:
            solutions = [""]

        _input = e["question"]
        if starter_code:
            _input += starter_code
        # Problems that define a function name are call-based; the rest read from
        # standard input (format hints as in the APPS references above).
        if fn_name:
            _input += "\nUse Call-Based format\n"
        else:
            _input += "\nUse Standard Input format\n"

        messages = [
            {
                "role": "user",
                "content": _input
            },
            {
                "role": "assistant",
                "content": solutions[0]
            }
        ]
        return {"messages": messages}

    # Create a validation set by carving 500 examples out of the shuffled train split.
    train_set = dataset["train"].shuffle(seed=42)
    validation_set = train_set.select(range(500))
    train_set = train_set.select(range(500, len(train_set)))
    dataset = DatasetDict({
        "train": train_set,
        "validation": validation_set,
        "test": dataset["test"]
    })

    dataset = dataset.map(process_example, num_proc=8)
    dataset.save_to_disk(f"{output_dir}/apps")
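

# Sketch, not part of the original script: inspect one processed APPS example to see
# the assembled prompt (question, optional starter code, Call-Based / Standard Input
# format hint) and the reference solution used as the assistant turn.
def preview_apps_example(path="datasets/apps"):
    from datasets import load_from_disk

    messages = load_from_disk(path)["train"][0]["messages"]
    print(messages[0]["content"])  # user turn: question + format hint
    print(messages[1]["content"])  # assistant turn: first ground-truth solution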


if __name__ == "__main__":
    # transform_conala()
    # transform_code_alpaca()
    transform_apps()