# cavis/libnd4j/auto_vectorization/bigGzipJson.pyx

'''
@author : Abdelrauf rauf@konduit.ai
Simple object extractor for very big json files
'''
import sys
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
cdef char JSON_1 = b':'
cdef char JSON_2 = b','
cdef char JSON_3 = b'{'
cdef char JSON_4 = b'}'
cdef char JSON_5 = b'['
cdef char JSON_6 = b']'
cdef char QUOTE = b'"'
cdef char ESCAPE = b"\\"
cdef char SPACE = b' '
cdef char TAB = b'\t'
cdef char CR = b'\r'
cdef char NL = b'\n'
cdef char B = b'\b'
cdef char EMPTY = b'\0'
cdef struct Span:
    int b
    int e
cdef inline Span read_unquoted(char *text, int start, int end):
    cdef Span sp
    cdef int j = start
    while j < end:
        #if text[j].isspace():
        if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B:
            j += 1
            continue
        if text[j] != QUOTE and text[j] != JSON_1 and text[j] != JSON_2 and text[j] != JSON_3 and text[j] != JSON_4 and text[j] != JSON_5 and text[j] != JSON_6:
            start = j
            j += 1
            while j < end:
                # read till JSON or white space
                if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B:
                    sp.b = start
                    sp.e = j
                    return sp
                elif text[j] == JSON_1 or text[j] == JSON_2 or text[j] == JSON_3 or text[j] == JSON_4 or text[j] == JSON_5 or text[j] == JSON_6:
                    sp.b = start
                    sp.e = j
                    return sp
                j += 1
                if j == end-1:
                    sp.b = start
                    sp.e = end
                    return sp
        break
    sp.b = j
    sp.e = j
    return sp
cdef inline Span read_seq_token(char *text, int start, int end):
    #read quoted
    #skip white_space
    cdef Span sp
    cdef int j = start
    cdef char last_char
    cdef char char_x
    while j < end:
        if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B:
            j += 1
            continue
        if text[j] == QUOTE:
            last_char = EMPTY
            #read till another quote
            start = j
            j += 1
            while j < end:
                char_x = text[j]
                if char_x == QUOTE and last_char != ESCAPE:
                    # finished reading
                    sp.b = start
                    sp.e = j+1
                    return sp
                last_char = char_x
                j += 1
                if j == end-1:
                    sp.b = start
                    sp.e = end
                    return sp
        else:
            break
    return read_unquoted(text, j, end)
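
# Illustrative sketch (not part of the original file): what the two C-level
# helpers above return for a small buffer. Spans are half-open [b, e) byte
# offsets into the buffer; the quoted reader keeps escaped quotes intact.
#
#   text = b' "ab\\"c" : 12, "x"'
#   read_seq_token(text, 0, len(text))    # -> Span(b=1, e=8),   text[1:8]   == b'"ab\\"c"'
#   read_unquoted(text, 10, len(text))    # -> Span(b=11, e=13), text[11:13] == b'12', stopped at ','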
def tokenizer_spans(utext):
    '''
    returns the token spans of the given bytes object
    '''
    token_spans = []
    last_char = b''
    end_i = len(utext)
    cdef char *text = utext
    i = 0
    cdef Span sp
    while i < end_i:
        sp = read_seq_token(text, i, end_i)
        i = sp.e
        if sp.e > sp.b:
            token_spans.append((sp.b, sp.e))
        if i < end_i:
            #if text[i] in JSON:
            if text[i] == JSON_3 or text[i] == JSON_4 or text[i] == JSON_5 or text[i] == JSON_6 or text[i] == JSON_1 or text[i] == JSON_2:
                token_spans.append((i, i+1))
            i += 1
    return token_spans
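
# Illustrative sketch (not part of the original file): tokenizer_spans yields
# half-open (begin, end) byte offsets for every token and structural character,
# e.g. for a tiny document:
#
#   tokenizer_spans(b'{"a": 1}')
#   # -> [(0, 1), (1, 4), (4, 5), (6, 7), (7, 8)]
#   #      '{'    '"a"'   ':'     '1'     '}'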
cdef class JsonObjXtractor:
    '''
    JsonObjXtractor that makes better use of Cython
    '''
    cdef Span* token_spans
    cdef size_t size

    def __cinit__(self, size_t count=4096):
        self.token_spans = <Span*> PyMem_Malloc(count * sizeof(Span))
        self.size = count
        if not self.token_spans:
            raise MemoryError()
    def __tokenizer_spans(self, utext, length):
        '''
        tokenizes utext into self.token_spans and returns the number of spans
        '''
        last_char = b''
        end_i = length
        cdef char *text = utext
        cdef int i = 0
        cdef size_t j = 0
        cdef Span sp
        while i < end_i:
            sp = read_seq_token(text, i, end_i)
            i = sp.e
            if sp.e > sp.b:
                self.token_spans[j] = sp
                j += 1
                if j >= self.size:
                    #we need to reallocate before the next write
                    self.__resize(self.size + self.size//2)
            if i < end_i:
                #if text[i] in JSON:
                if text[i] == JSON_3 or text[i] == JSON_4 or text[i] == JSON_5 or text[i] == JSON_6 or text[i] == JSON_1 or text[i] == JSON_2:
                    sp.b = i
                    sp.e = i+1
                    self.token_spans[j] = sp
                    j += 1
                    if j >= self.size:
                        #we need to reallocate before the next write
                        self.__resize(self.size + self.size//2)
                i += 1
        return j
    def try_extract_parent_obj(self, json_bytes, property_name, next_contains_value=b'', debug=False):
        '''
        try_extract_parent_obj(json_bytes, property_name, next_contains_value=b'', debug=False):
        make sure the passed arguments are encoded to bytes with encode('utf-8')
        next_contains_value is matched either against the value that directly follows the property
        or against the first element after a '['
        tries to extract the parent object of the given named property
        if the left brace of the parent object is outside of the current buffer
        the object will be ignored
        if the right brace is outside of the buffer it is left to be handled by the caller
        '''
        look_for_the_left = True
        parent_left = []
        parent_right = []
        parent_objects = []
        len_next = len(next_contains_value)
        cdef int ind = 0
        cdef int end
        cdef int last_start = 0
        property_name = b'"'+property_name+b'"'
        cdef int lenx = self.__tokenizer_spans(json_bytes, len(json_bytes))
        cdef char x
        cdef int i = -1
        cdef Span sp
        while i < lenx-1:
            i += 1
            ind = self.token_spans[i].b
            x = json_bytes[ind]
            #print("-----{0} -- {1} -- {2} ".format(x,parent_left,parent_right))
            if look_for_the_left == False:
                if x == JSON_3:
                    parent_right.append(ind)
                elif x == JSON_4:
                    if len(parent_right) == 0:
                        #we found the parent closing brace
                        look_for_the_left = True
                        parent_objects.append((parent_left[-1], ind+1))
                        last_start = ind+1
                        #print("=============found {0}".format(parent_objects))
                        parent_left = []
                        parent_right = []
                    else:
                        parent_right.pop()
                continue

            #search obj
            if look_for_the_left:
                if x == JSON_3:
                    parent_left.append(ind)
                    last_start = ind
                elif x == JSON_4:
                    if len(parent_left) >= 1:
                        #ignore
                        parent_left.pop()

            if x == JSON_1:  # ':'
                #check to see if it is the property name
                old_property = EMPTY
                if i > 1:
                    sp = self.token_spans[i-1]
                    old_property = json_bytes[sp.b:sp.e]
                if old_property == property_name:
                    #we found it
                    if len(parent_left) < 1:
                        #the left brace is outside of the buffer
                        #we have to ignore it
                        #try to increase the buffer
                        if debug:
                            print('''left brace of the parent is outside of the buffer and the parent is big.
                            it will be ignored
                            try to choose unambiguous property names if you are looking for small objects''', file=sys.stderr)
                        last_start = ind+1
                        parent_left = []
                        parent_right = []
                        continue
                    else:
                        #print("++++++ look for the right brace")
                        if len_next > 0 and i+1 < lenx:
                            i += 1
                            ind = self.token_spans[i].b
                            end = self.token_spans[i].e
                            m = json_bytes[ind]
                            if m == JSON_5:
                                #print ("----{0} {1}".format(m,JSON_5))
                                if i+1 < lenx:
                                    i += 1
                                    ind = self.token_spans[i].b
                                    end = self.token_spans[i].e
                                    #print ("----{0} == {1}".format(next_contains_value,json_bytes[ind:end]))
                                    if len_next <= end-ind and next_contains_value in json_bytes[ind:end]:
                                        look_for_the_left = False
                                        continue
                            elif len_next <= end-ind and next_contains_value in json_bytes[ind:end]:
                                look_for_the_left = False
                                continue
                            #ignore it as it does not contain that value
                            parent_left = []
                            parent_right = []
                            last_start = ind + 1
                        else:
                            look_for_the_left = False

        # return the last successfully opened brace,
        # or, in the left-brace failure case, the last safely closed position
        if len(parent_left) > 0:
            return (parent_objects, parent_left[-1])
        return (parent_objects, last_start)
    def __resize(self, size_t new_count):
        cdef Span* mem = <Span*> PyMem_Realloc(self.token_spans, new_count * sizeof(Span))
        if not mem:
            raise MemoryError()
        self.token_spans = mem
        self.size = new_count

    def __dealloc__(self):
        PyMem_Free(self.token_spans)
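
# Illustrative sketch (not part of the original file): extracting the parent
# object that owns a given property from a byte buffer. The JSON snippet and
# property names below are made up for the example.
#
#   xt = JsonObjXtractor()
#   buf = b'{"a": {"name": "x", "v": 1}, "b": {"name": "y", "v": 2}}'
#   objects, last_index = xt.try_extract_parent_obj(buf, b'name', b'x')
#   # objects -> [(6, 27)]   i.e. buf[6:27] == b'{"name": "x", "v": 1}'
#   # last_index marks where the unprocessed tail starts; the caller keeps it,
#   # as json_gzip_extract_objects below does with strx = strx[last_index:]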
import json
import gzip
import sys
DEBUG_LOG = False
def json_gzip_extract_objects(filename, property_name, next_contains_value=''):
    strx = b''
    started = False
    b_next_contains_value = next_contains_value.encode('utf-8')
    b_property_name = property_name.encode('utf-8')
    #print(b_property_name)
    objXt = JsonObjXtractor()
    with gzip.open(filename, 'rb') as f:
        if DEBUG_LOG:
            print("opened {0}".format(filename), file=sys.stderr)
        #instead of reading it line by line, read it as binary chunks
        is_End = False
        #total = 0
        while is_End == False:
            buffer = f.read(16384*2)
            lenx = len(buffer)
            #total += lenx
            if lenx < 1:
                is_End = True
            else:
                strx = strx + buffer
                objects, last_index = objXt.try_extract_parent_obj(strx, b_property_name, b_next_contains_value)
                # if b_property_name in strx and b_next_contains_value in strx:
                #     print(strx)
                #     print(objects)
                #     print(last_index)
                #     print("===================================================")
                for start, end in objects:
                    yield json.loads(strx[start:end])  #.decode('utf-8'))
                #remove the processed part
                if last_index < len(strx):
                    strx = strx[last_index:]
                else:
                    strx = b''
                #print('----+++')
                if len(strx) > 16384*4:
                    #the buffer is too big
                    #try to avoid big parents
                    if DEBUG_LOG:
                        print("parent object is too big. please look for a better property name", file=sys.stderr)
                    break
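
# Illustrative usage sketch (not part of the original file; the file name and
# property values are hypothetical):
#
#   for obj in json_gzip_extract_objects('report.json.gz', 'kind', 'loop'):
#       print(obj)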