# cavis/libnd4j/auto_vectorization/bigGzipJson.pyx

'''
@author : Abdelrauf rauf@konduit.ai
Simple object extractor for very big json files
'''
import sys
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
cdef char JSON_1 = b':'
cdef char JSON_2 = b','
cdef char JSON_3 = b'{'
cdef char JSON_4 = b'}'
cdef char JSON_5 = b'['
cdef char JSON_6 = b']'
cdef char QUOTE = b'"'
cdef char ESCAPE = b"\\"
cdef char SPACE = b' '
cdef char TAB = b'\t'
cdef char CR = b'\r'
cdef char NL = b'\n'
cdef char B = b'\b'
cdef char EMPTY = b'\0'
cdef struct Span:
    int b
    int e
cdef inline Span read_unquoted(char *text, int start, int end):
    cdef Span sp
    cdef int j = start
    while j < end:
        #if text[j].isspace():
        if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B:
            j += 1
            continue
        if text[j] != QUOTE and text[j] != JSON_1 and text[j] != JSON_2 and text[j] != JSON_3 and text[j] != JSON_4 and text[j] != JSON_5 and text[j] != JSON_6:
            start = j
            j += 1
            while j < end:
                # read till JSON or white space
                if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B:
                    sp.b = start
                    sp.e = j
                    return sp
                elif text[j] == JSON_1 or text[j] == JSON_2 or text[j] == JSON_3 or text[j] == JSON_4 or text[j] == JSON_5 or text[j] == JSON_6:
                    sp.b = start
                    sp.e = j
                    return sp
                j += 1
                if j == end-1:
                    sp.b = start
                    sp.e = end
                    return sp
        break
    sp.b = j
    sp.e = j
    return sp
cdef inline Span read_seq_token(char *text, int start, int end):
    #read quoted
    #skip white_space
    cdef Span sp
    cdef int j = start
    cdef char last_char
    cdef char char_x
    while j < end:
        if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B:
            j += 1
            continue
        if text[j] == QUOTE:
            last_char = EMPTY
            #read till another quote
            start = j
            j += 1
            while j < end:
                char_x = text[j]
                if char_x == QUOTE and last_char != ESCAPE:
                    # finished reading
                    sp.b = start
                    sp.e = j+1
                    return sp
                last_char = char_x
                j += 1
                if j == end-1:
                    sp.b = start
                    sp.e = end
                    return sp
        else:
            break
    return read_unquoted(text, j, end)
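
# Illustrative sketch (not part of the original file): what the two C-level
# helpers above return for a small buffer. Spans are half-open [b, e) byte
# offsets into the buffer; the quoted reader keeps escaped quotes intact.
#
#   text = b' "ab\\"c" : 12, "x"'
#   read_seq_token(text, 0, len(text))    # -> Span(b=1, e=8),   text[1:8]   == b'"ab\\"c"'
#   read_unquoted(text, 10, len(text))    # -> Span(b=11, e=13), text[11:13] == b'12', stopped at ','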
def tokenizer_spans(utext):
    '''
    returns the token spans of the given bytes object
    '''
    token_spans = []
    last_char = b''
    end_i = len(utext)
    cdef char *text = utext
    i = 0
    cdef Span sp
    while i < end_i:
        sp = read_seq_token(text, i, end_i)
        i = sp.e
        if sp.e > sp.b:
            token_spans.append((sp.b, sp.e))
        if i < end_i:
            #if text[i] in JSON:
            if text[i] == JSON_3 or text[i] == JSON_4 or text[i] == JSON_5 or text[i] == JSON_6 or text[i] == JSON_1 or text[i] == JSON_2:
                token_spans.append((i, i+1))
            i += 1
    return token_spans
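
# Illustrative sketch (not part of the original file): tokenizer_spans yields
# half-open (begin, end) byte offsets for every token and structural character,
# e.g. for a tiny document:
#
#   tokenizer_spans(b'{"a": 1}')
#   # -> [(0, 1), (1, 4), (4, 5), (6, 7), (7, 8)]
#   #      '{'    '"a"'   ':'     '1'     '}'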
cdef class JsonObjXtractor:
    '''
    JsonObjXtractor that makes better use of Cython
    '''
    cdef Span* token_spans
    cdef size_t size

    def __cinit__(self, size_t count=4096):
        self.token_spans = <Span*> PyMem_Malloc(count * sizeof(Span))
        self.size = count
        if not self.token_spans:
            raise MemoryError()
    def __tokenizer_spans(self, utext, length):
        '''
        tokenizes utext into self.token_spans and returns the number of spans
        '''
        last_char = b''
        end_i = length
        cdef char *text = utext
        cdef int i = 0
        cdef size_t j = 0
        cdef Span sp
        while i < end_i:
            sp = read_seq_token(text, i, end_i)
            i = sp.e
            if sp.e > sp.b:
                self.token_spans[j] = sp
                j += 1
                if j >= self.size:
                    #we need to reallocate before the next write
                    self.__resize(self.size + self.size//2)
            if i < end_i:
                #if text[i] in JSON:
                if text[i] == JSON_3 or text[i] == JSON_4 or text[i] == JSON_5 or text[i] == JSON_6 or text[i] == JSON_1 or text[i] == JSON_2:
                    sp.b = i
                    sp.e = i+1
                    self.token_spans[j] = sp
                    j += 1
                    if j >= self.size:
                        #we need to reallocate before the next write
                        self.__resize(self.size + self.size//2)
                i += 1
        return j
    def try_extract_parent_obj(self, json_bytes, property_name, next_contains_value=b'', debug=False):
        '''
        try_extract_parent_obj(json_bytes, property_name, next_contains_value=b'', debug=False):
        make sure the passed arguments are encoded to bytes with encode('utf-8')
        next_contains_value is matched either against the value that directly follows the property
        or against the first element after a '['
        tries to extract the parent object of the given named property
        if the left brace of the parent object is outside of the current buffer
        the object will be ignored
        if the right brace is outside of the buffer it is left to be handled by the caller
        '''
        look_for_the_left = True
        parent_left = []
        parent_right = []
        parent_objects = []
        len_next = len(next_contains_value)
        cdef int ind = 0
        cdef int end
        cdef int last_start = 0
        property_name = b'"'+property_name+b'"'
        cdef int lenx = self.__tokenizer_spans(json_bytes, len(json_bytes))
        cdef char x
        cdef int i = -1
        cdef Span sp
        while i < lenx-1:
            i += 1
            ind = self.token_spans[i].b
            x = json_bytes[ind]
            #print("-----{0} -- {1} -- {2} ".format(x,parent_left,parent_right))
            if look_for_the_left == False:
                if x == JSON_3:
                    parent_right.append(ind)
                elif x == JSON_4:
                    if len(parent_right) == 0:
                        #we found the parent closing brace
                        look_for_the_left = True
                        parent_objects.append((parent_left[-1], ind+1))
                        last_start = ind+1
                        #print("=============found {0}".format(parent_objects))
                        parent_left = []
                        parent_right = []
                    else:
                        parent_right.pop()
                continue

            #search obj
            if look_for_the_left:
                if x == JSON_3:
                    parent_left.append(ind)
                    last_start = ind
                elif x == JSON_4:
                    if len(parent_left) >= 1:
                        #ignore
                        parent_left.pop()

            if x == JSON_1:  # ':'
                #check to see if it is the property name
                old_property = EMPTY
                if i > 1:
                    sp = self.token_spans[i-1]
                    old_property = json_bytes[sp.b:sp.e]
                if old_property == property_name:
                    #we found it
                    if len(parent_left) < 1:
                        #the left brace is outside of the buffer
                        #we have to ignore it
                        #try to increase the buffer
                        if debug:
                            print('''left brace of the parent is outside of the buffer and the parent is big.
                            it will be ignored
                            try to choose unambiguous property names if you are looking for small objects''', file=sys.stderr)
                        last_start = ind+1
                        parent_left = []
                        parent_right = []
                        continue
                    else:
                        #print("++++++ look for the right brace")
                        if len_next > 0 and i+1 < lenx:
                            i += 1
                            ind = self.token_spans[i].b
                            end = self.token_spans[i].e
                            m = json_bytes[ind]
                            if m == JSON_5:
                                #print ("----{0} {1}".format(m,JSON_5))
                                if i+1 < lenx:
                                    i += 1
                                    ind = self.token_spans[i].b
                                    end = self.token_spans[i].e
                                    #print ("----{0} == {1}".format(next_contains_value,json_bytes[ind:end]))
                                    if len_next <= end-ind and next_contains_value in json_bytes[ind:end]:
                                        look_for_the_left = False
                                        continue
                            elif len_next <= end-ind and next_contains_value in json_bytes[ind:end]:
                                look_for_the_left = False
                                continue
                            #ignore it as it does not contain that value
                            parent_left = []
                            parent_right = []
                            last_start = ind + 1
                        else:
                            look_for_the_left = False

        # return the last successfully opened brace,
        # or, in the left-brace failure case, the last safely closed position
        if len(parent_left) > 0:
            return (parent_objects, parent_left[-1])
        return (parent_objects, last_start)
    def __resize(self, size_t new_count):
        cdef Span* mem = <Span*> PyMem_Realloc(self.token_spans, new_count * sizeof(Span))
        if not mem:
            raise MemoryError()
        self.token_spans = mem
        self.size = new_count

    def __dealloc__(self):
        PyMem_Free(self.token_spans)
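
# Illustrative sketch (not part of the original file): extracting the parent
# object that owns a given property from a byte buffer. The JSON snippet and
# property names below are made up for the example.
#
#   xt = JsonObjXtractor()
#   buf = b'{"a": {"name": "x", "v": 1}, "b": {"name": "y", "v": 2}}'
#   objects, last_index = xt.try_extract_parent_obj(buf, b'name', b'x')
#   # objects -> [(6, 27)]   i.e. buf[6:27] == b'{"name": "x", "v": 1}'
#   # last_index marks where the unprocessed tail starts; the caller keeps it,
#   # as json_gzip_extract_objects below does with strx = strx[last_index:]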
import json
import gzip
import sys
DEBUG_LOG = False
def json_gzip_extract_objects(filename, property_name, next_contains_value=''):
    strx = b''
    started = False
    b_next_contains_value = next_contains_value.encode('utf-8')
    b_property_name = property_name.encode('utf-8')
    #print(b_property_name)
    objXt = JsonObjXtractor()
    with gzip.open(filename, 'rb') as f:
        if DEBUG_LOG:
            print("opened {0}".format(filename), file=sys.stderr)
        #instead of reading it line by line, read it as binary chunks
        is_End = False
        #total = 0
        while is_End == False:
            buffer = f.read(16384*2)
            lenx = len(buffer)
            #total += lenx
            if lenx < 1:
                is_End = True
            else:
                strx = strx + buffer
                objects, last_index = objXt.try_extract_parent_obj(strx, b_property_name, b_next_contains_value)
                # if b_property_name in strx and b_next_contains_value in strx:
                #     print(strx)
                #     print(objects)
                #     print(last_index)
                #     print("===================================================")
                for start, end in objects:
                    yield json.loads(strx[start:end])  #.decode('utf-8'))
                #remove the processed part
                if last_index < len(strx):
                    strx = strx[last_index:]
                else:
                    strx = b''
                #print('----+++')
                if len(strx) > 16384*4:
                    #the buffer is too big
                    #try to avoid big parents
                    if DEBUG_LOG:
                        print("parent object is too big. please look for a better property name", file=sys.stderr)
                    break
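
# Illustrative usage sketch (not part of the original file; the file name and
# property values are hypothetical):
#
#   for obj in json_gzip_extract_objects('report.json.gz', 'kind', 'loop'):
#       print(obj)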