Skip to content

json_to_csv_converter

Convert the Yelp Dataset Challenge dataset from json format to csv.

For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge

parser = argparse.ArgumentParser(description='Convert Yelp Dataset Challenge data from JSON format to CSV.') module-attribute

args = parser.parse_args() module-attribute

json_file = args.json_file module-attribute

csv_file = '{0}.csv'.format(json_file.split('.json')[0]) module-attribute

column_names = get_superset_of_column_names_from_file(json_file) module-attribute

read_and_write_file(json_file_path, csv_file_path, column_names)

Read in the json dataset file and write it out to a csv file, given the column names.

Source code in src/streamsight/utils/json_to_csv_converter.py
17
18
19
20
21
22
23
24
25
def read_and_write_file(json_file_path, csv_file_path, column_names) -> None:
    """Convert a JSON-lines dataset file into a CSV file.

    Args:
        json_file_path: Path of the input file; every line is parsed as one
            JSON document.
        csv_file_path: Path of the CSV file to create (truncated if it exists).
        column_names: Iterable of flattened column names. It is written as the
            header row and passed to ``get_row`` for every record.
    """
    # Materialize once so the header and every row use the same column order,
    # and so a one-shot iterable is not exhausted by writing the header.
    columns = list(column_names)
    # newline="" hands line-ending control to the csv module (otherwise rows
    # end in "\r\r\n" on Windows and embedded newlines are mangled); the
    # explicit encoding mirrors the UTF-8 used to read the input.
    with open(csv_file_path, "w", newline="", encoding="utf-8") as fout:
        writer = csv.writer(fout)
        writer.writerow(columns)
        with open(json_file_path, encoding="utf-8") as fin:
            for line in fin:
                writer.writerow(get_row(json.loads(line), columns))

get_superset_of_column_names_from_file(json_file_path)

Read in the json dataset file and return the superset of column names.

Source code in src/streamsight/utils/json_to_csv_converter.py
28
29
30
31
32
33
34
35
def get_superset_of_column_names_from_file(json_file_path):
    """Collect every flattened column name occurring in a JSON-lines file.

    Each line of the file is parsed as a JSON document and flattened with
    ``get_column_names``; the union of all resulting keys is returned as a set.
    """
    superset = set()
    with open(json_file_path, encoding="utf-8") as fin:
        for raw_line in fin:
            record = json.loads(raw_line)
            # Updating a set from a dict adds the dict's keys.
            superset.update(get_column_names(record))
    return superset

get_column_names(line_contents, parent_key='')

Return a dict mapping flattened (dot-separated) key names to their values, given a dict.

Example:

line_contents = {
    'a': {
        'b': 2,
        'c': 3,
        },
}

will return: {'a.b': 2, 'a.c': 3}

These will be the column names for the eventual csv file.

Source code in src/streamsight/utils/json_to_csv_converter.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def get_column_names(line_contents, parent_key="") -> dict:
    """Flatten a nested dict into a single-level dict keyed by dotted paths.

    Example:

        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }

        will return: {'a.b': 2, 'a.c': 3}

    The keys of the result are the column names for the eventual csv file.

    Args:
        line_contents: The (possibly nested) dict to flatten.
        parent_key: Dotted prefix of the enclosing keys; empty at the top level.

    Returns:
        A dict mapping each flattened key to its leaf value.
    """
    flattened = {}
    for key, value in line_contents.items():
        dotted = f"{parent_key}.{key}" if parent_key else key
        if isinstance(value, MutableMapping):
            # Recurse into nested mappings, carrying the dotted prefix along.
            flattened.update(get_column_names(value, dotted))
        else:
            flattened[dotted] = value
    return flattened

get_nested_value(d, key)

Return a dictionary item given a dictionary d and a flattened key from get_column_names.

Example:

d = {
    'a': {
        'b': 2,
        'c': 3,
        },
}
key = 'a.b'

will return: 2
Source code in src/streamsight/utils/json_to_csv_converter.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def get_nested_value(d, key):
    """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.

    Example:

        d = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }
        key = 'a.b'

        will return: 2

    Args:
        d: The (possibly nested) dict to look the value up in.
        key: Dot-separated path of keys, e.g. ``'a.b'``.

    Returns:
        The value found at the path, or ``None`` when any path segment is
        missing or when an intermediate value is not a mapping (for example
        a record holding ``{'a': None}`` while the column ``'a.b'`` comes
        from another record).
    """
    from collections.abc import Mapping

    current = d
    for part in key.split("."):
        # A non-mapping intermediate (None, number, string, list) cannot hold
        # the remaining path; report "no value" instead of raising TypeError.
        if not isinstance(current, Mapping) or part not in current:
            return None
        current = current[part]
    return current

get_row(line_contents, column_names)

Return a csv compatible row given column names and a dict.

Source code in src/streamsight/utils/json_to_csv_converter.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict.

    Args:
        line_contents: One decoded JSON record (possibly nested dict).
        column_names: Iterable of flattened column names; one cell is
            produced per name, in iteration order.

    Returns:
        A list of strings: the value found for each column, or ``""`` when
        the record has no value for that column.
    """
    row = []
    for column_name in column_names:
        value = get_nested_value(line_contents, column_name)
        if value is None:
            row.append("")
        elif isinstance(value, str):
            # Keep text as text: formatting the UTF-8 *bytes* would write the
            # literal repr "b'...'" into the cell on Python 3.
            row.append(value)
        else:
            row.append("{0}".format(value))
    return row