# Source code for emlib.jsontools

"""
Set of misc. utilities to work with json

"""
from __future__ import annotations
import re


def _comments_replacer(match):
    """
    Replacement callback for ``re.sub``: drop comments, keep anything else

    Args:
        match: a regex match object; its full text (group 0) is inspected

    Returns:
        an empty string when the matched text starts with ``/``, otherwise
        the matched text unchanged
    """
    text = match.group(0)
    if text[0] == '/':
        # Starts with a slash: treated as a comment and removed
        return ""
    return text
    


# [docs]
def remove_comments(json_like: str):
    """
    Strip C-style comments (``// ...`` and ``/* ... */``) from *json_like*

    Quoted strings (single or double quoted) are matched as well and handed
    to ``_comments_replacer`` together with the comments, so that comment
    markers appearing inside a string are not mistaken for a comment.

    Args:
        json_like: json text which may contain comments

    Returns:
        the text with the comments removed
    """
    # Alternatives, in order: line comment, block comment,
    # single-quoted string, double-quoted string
    pattern = r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"'
    # DOTALL lets a block comment span several lines; MULTILINE makes ``$``
    # stop a line comment at the end of its own line
    flags = re.DOTALL | re.MULTILINE
    return re.sub(pattern, _comments_replacer, json_like, flags=flags)




# [docs]
def remove_trailing_commas(json_like: str):
    """
    Drop the comma preceding a closing ``}`` or ``]`` in *json_like*

    Any whitespace between the comma and the closing bracket is removed
    as well. The lookahead in each pattern is meant to skip commas which
    sit inside a double-quoted string.

    Args:
        json_like: json text which may contain trailing commas

    Returns:
        the text with the trailing commas removed

    Example::

        >>> remove_trailing_commas('{"foo":"bar","baz":["blah",],}')
        '{"foo":"bar","baz":["blah"]}'
    """
    # (pattern, replacement) pairs, applied in order: objects, then arrays
    passes = (
        (r'(,)\s*}(?=([^"\\]*(\\.|"([^"\\]*\\.)*[^"\\]*"))*[^"]*$)', "}"),
        (r'(,)\s*\](?=([^"\\]*(\\.|"([^"\\]*\\.)*[^"\\]*"))*[^"]*$)', "]"),
    )
    cleaned = json_like
    for pattern, closing in passes:
        cleaned = re.sub(pattern, closing, cleaned)
    return cleaned




# [docs]
def remove_all(json_like: str):
    """
    Strip comments first, then trailing commas, from *json_like*

    Args:
        json_like: the json-like text to clean up

    Returns:
        the text after both clean-up passes
    """
    without_comments = remove_comments(json_like)
    return remove_trailing_commas(without_comments)




# [docs]
def json_minify(json: str, strip_space: bool = True) -> str:
    """
    Strip comments and, optionally, whitespace from a json string

    ``//`` line comments and ``/* */`` block comments found outside of
    string literals are removed. The contents of string literals are always
    copied verbatim. The line break which terminates a ``//`` comment is
    dropped together with the comment. A comment left unterminated at the
    end of the input is dropped as well.

    Args:
        json: a string representing a json object
        strip_space: if True, also remove whitespace (space, tab, newline,
            carriage return) found outside of string literals

    Returns:
        the minified json

    Example::

        >>> json_minify('{"a": 1, /* note */ "b": [1, 2]}')
        '{"a":1,"b":[1,2]}'
    """
    tokenizer = re.compile(r'"|(/\*)|(\*/)|(//)|\n|\r')
    # Only the whitespace characters defined by the json standard
    whitespace = re.compile('[ \t\n\r]+')
    newlines = ('\n', '\r')

    in_string = False
    inmulticmt = False
    insinglecmt = False
    new_str = []
    from_index = 0     # from is a keyword in Python

    for match in tokenizer.finditer(json):
        token = match.group()
        # State as it was *before* this token; only one branch below runs
        # per token, so it is safe to evaluate it once up front
        in_comment = inmulticmt or insinglecmt

        if not in_comment:
            # Copy the text between the previous token and this one
            chunk = json[from_index:match.start()]
            if not in_string and strip_space:
                chunk = whitespace.sub('', chunk)
            new_str.append(chunk)

        from_index = match.end()

        if token == '"' and not in_comment:
            # Count the backslashes directly preceding the quote: an odd
            # number means the quote itself is escaped
            pos = match.start()
            while pos > 0 and json[pos - 1] == '\\':
                pos -= 1
            num_backslashes = match.start() - pos
            if not in_string or num_backslashes % 2 == 0:
                # start of string with ", or unescaped "
                # character found to end string
                in_string = not in_string
            from_index -= 1   # include " character in next catch
        elif token == '/*' and not (in_string or in_comment):
            inmulticmt = True
        elif token == '*/' and inmulticmt and not in_string:
            # Only a block comment which is actually open can be closed
            inmulticmt = False
        elif token == '//' and not (in_string or in_comment):
            insinglecmt = True
        elif token in newlines and insinglecmt and not in_string:
            # A line break terminates the current line comment
            insinglecmt = False
        elif not in_comment and (token not in newlines or not strip_space):
            # Comment markers inside a string, a stray "*/", or a line
            # break which must be preserved: copy the token as is
            new_str.append(token)

    # Text after the last token gets the same treatment as the text between
    # tokens: dropped inside an open comment, stripped outside of strings
    if not (inmulticmt or insinglecmt):
        tail = json[from_index:]
        if not in_string and strip_space:
            tail = whitespace.sub('', tail)
        new_str.append(tail)
    return ''.join(new_str)