# Source: data_engine/fix_json_ocr.py (OpenJarvisAI/data_engine, master branch, GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import json
import sys

'''
Convert all OCR-related samples to use the single "OCR:" instruction, for simplicity.
'''

def update_json_file(file_path):
    """Rewrite every sample's human prompt to the fixed OCR instruction, in place.

    Loads the JSON list stored at ``file_path``. For each sample the
    ``conversations`` list is rebuilt: every ``human`` turn becomes
    ``'<image>\\nOCR:'``, every ``gpt`` turn keeps only its ``from`` and
    ``value`` entries, and turns from any other speaker are dropped. The
    result is written back to the same path and the number of samples is
    printed.

    Args:
        file_path: Path of the JSON file to rewrite; the file is overwritten.

    Raises:
        KeyError: If a sample has no ``image`` or ``conversations`` entry, or
            a turn has no ``from`` entry (or no ``value`` for a gpt turn).
    """
    # Explicit UTF-8: the output is dumped with ensure_ascii=False, so relying
    # on the platform default encoding can fail on (or mangle) non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for item in data:
        # The rewritten prompt references '<image>', so a sample without an
        # image entry is rejected rather than silently converted.
        if 'image' not in item:
            raise KeyError('image')

        new_convs = []
        for turn in item['conversations']:
            speaker = turn['from']
            if speaker == 'human':
                # The original question is discarded on purpose: every human
                # turn is normalised to the same OCR instruction.
                new_convs.append({'from': 'human', 'value': '<image>\nOCR:'})
            elif speaker == 'gpt':
                new_convs.append({'from': 'gpt', 'value': turn['value']})
        item['conversations'] = new_convs

    print(f'All samples: {len(data)}')
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

# Command-line entry point: the first argument is the JSON file to rewrite.
# Guarded so that importing this module does not rewrite a file as a side
# effect; extra arguments are ignored.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Exit with a readable usage message instead of a bare IndexError.
        sys.exit(f'Usage: {sys.argv[0]} <json_file>')
    json_file_path = sys.argv[1]

    update_json_file(json_file_path)