-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfix_json_ocr.py
More file actions
31 lines (24 loc) · 825 Bytes
/
fix_json_ocr.py
File metadata and controls
31 lines (24 loc) · 825 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import json
import sys
'''
Convert all OCR-related conversation data to use the single "OCR:" instruction, for simplicity.
'''
def update_json_file(file_path):
    """Rewrite every human prompt in a conversation JSON file to the OCR prompt.

    The file at ``file_path`` is loaded, modified and written back in place.
    For each sample, ``conversations`` is rebuilt so that:

    * every turn with ``from == 'human'`` gets the fixed value
      ``'<image>\\nOCR:'`` (the original question text is discarded);
    * every turn with ``from == 'gpt'`` keeps its original ``value``;
    * turns from any other speaker are dropped, and extra keys on a turn
      are not carried over.

    All other keys of a sample are left untouched. The number of samples is
    printed to stdout before the file is written.

    Args:
        file_path: Path to a JSON file containing a list of samples, each a
            dict with a ``conversations`` list of
            ``{'from': ..., 'value': ...}`` dicts.

    Raises:
        KeyError: If a sample has no ``conversations`` key, a turn has no
            ``from`` key, or a ``gpt`` turn has no ``value`` key.
    """
    ocr_prompt = '<image>\nOCR:'

    # Read as UTF-8 explicitly: the platform default encoding may not be
    # able to decode non-ASCII OCR text.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for item in data:
        new_convs = []
        for turn in item['conversations']:
            speaker = turn['from']
            if speaker == 'human':
                new_convs.append({'from': 'human', 'value': ocr_prompt})
            elif speaker == 'gpt':
                new_convs.append({'from': 'gpt', 'value': turn['value']})
        item['conversations'] = new_convs

    print(f'All samples: {len(data)}')

    # ensure_ascii=False emits raw non-ASCII characters, so the output file
    # must be opened with an encoding that can represent them.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
# Script entry point: rewrite, in place, the JSON file named by the first
# command-line argument. Guarded so that importing this module does not
# trigger the rewrite (or fail when no argument is present).
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Exit with a readable usage message instead of a bare IndexError.
        sys.exit(f'Usage: python {sys.argv[0]} <json_file_path>')
    json_file_path = sys.argv[1]
    update_json_file(json_file_path)