From f25056363b7bdb801eb208506b08c219ca5f9755 Mon Sep 17 00:00:00 2001 From: Abdelrauf Date: Tue, 28 Jan 2020 20:00:12 +0400 Subject: [PATCH] auto-vectorization check for gcc (#172) * Autovectorization tool: - sync output for gnu make - Reduced html output - links for line numbers - AutoVectorization.md Signed-off-by: AbdelRauf * Detailed report with `-fsave-optimization-record` option Signed-off-by: AbdelRauf * Readme Signed-off-by: AbdelRauf Co-authored-by: raver119 --- libnd4j/CMakeLists.txt | 2 +- libnd4j/README.md | 3 + .../auto_vectorization/AutoVectorization.md | 49 ++ libnd4j/auto_vectorization/auto_vect.py | 546 ++++++++++++++++++ libnd4j/auto_vectorization/bigGzipJson.pyx | 354 ++++++++++++ libnd4j/auto_vectorization/cython_setup.py | 3 + libnd4j/blas/CMakeLists.txt | 26 + libnd4j/buildnativeoperations.sh | 21 +- 8 files changed, 1001 insertions(+), 3 deletions(-) create mode 100644 libnd4j/auto_vectorization/AutoVectorization.md create mode 100644 libnd4j/auto_vectorization/auto_vect.py create mode 100644 libnd4j/auto_vectorization/bigGzipJson.pyx create mode 100644 libnd4j/auto_vectorization/cython_setup.py diff --git a/libnd4j/CMakeLists.txt b/libnd4j/CMakeLists.txt index c82b0b217..cf9d4ff88 100755 --- a/libnd4j/CMakeLists.txt +++ b/libnd4j/CMakeLists.txt @@ -5,7 +5,7 @@ option(NATIVE "Optimize for build machine (might not work on others)" OFF) set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) #ensure we create lib files set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) - +option(CHECK_VECTORIZATION "checks for vectorization" OFF) option(BUILD_TESTS "Build tests" OFF) option(FLATBUFFERS_BUILD_FLATC "Enable the build of the flatbuffers compiler" OFF) set(FLATBUFFERS_BUILD_FLATC "OFF" CACHE STRING "Hack to disable flatc build" FORCE) diff --git a/libnd4j/README.md b/libnd4j/README.md index 9cea1b597..ec17c6227 100644 --- a/libnd4j/README.md +++ b/libnd4j/README.md @@ -17,8 +17,11 @@ There's few additional arguments for `buildnativeoperations.sh` script you could -b release OR -b debug // enables/desables debug builds. release is considered by default -j XX // this argument defines how many threads will be used to binaries on your box. i.e. -j 8 -cc XX// CUDA-only argument, builds only binaries for target GPU architecture. use this for fast builds + --check-vectorization auto-vectorization report for developers. (Currently, only GCC is supported) ``` +[More about AutoVectorization report](auto_vectorization/AutoVectorization.md) + You can find the compute capability for your card [on the NVIDIA website here](https://developer.nvidia.com/cuda-gpus). For example, a GTX 1080 has compute capability 6.1, for which you would use ```-cc 61``` (note no decimal point). diff --git a/libnd4j/auto_vectorization/AutoVectorization.md b/libnd4j/auto_vectorization/AutoVectorization.md new file mode 100644 index 000000000..61b98febe --- /dev/null +++ b/libnd4j/auto_vectorization/AutoVectorization.md @@ -0,0 +1,49 @@ +# Auto-vectorization Report + +This report tool is used to get a human-friendly compiler output of the auto-vectorization process. It is intended for developers to help them to investigate the obstacles that compiler faced during auto-vectorization. + +## Usage +```--check-vectorization``` option should be added to the **release** build to be able to get the auto-vectorization report +```./buildnativeoperations.sh -a native -j 28 --check-vectorization``` +it will output ```vecmiss.html``` inside blasbuild/cpu folder. 
+
+## Report Format
+Each file name row shows the optimization attempts for that source file's lines.
+Each line number is expandable (⇲) and lists the distinct failure notes.
+Line numbers are also links to the corresponding source code.
+
+| file name | total successful attempts | total failed attempts | ⇲ |
+|---|---|---|---|
+| line number | successful attempts | failed attempts | ⇲ |
+|- failure reasons |
+| line number | successful attempts | failed attempts | ⇲ |
+
+##### Requirements
+- GCC (currently, only GCC is supported)
+- python3
+
+### Detailed report with the `-fsave-optimization-record` option
+If you want more detailed information (for now, the functions in which failures occurred), you need a newer toolchain (GCC > 9), since only recent GCC versions provide the `-fsave-optimization-record` option.
+`buildnativeoperations.sh` (via CMake) detects such a compiler and switches to the more detailed version automatically.
+Please note that this option is still experimental, so the compiler may fail to emit some json.gz files.
+In that case, try to exclude those files from the build.
+The internal structure of the `-fsave-optimization-record` json.gz output may also change in the future.
+
+This mode outputs two files, **vecmiss_fsave.html** and **vecmiss_fsave.html.js**, so JavaScript must be enabled in your browser to see the report details.
+
+##### Requirements for the detailed report
+- GCC version > 9
+- python3
+- Cython (python3)
+- json (python3)
+- gzip (python3)
+- c++filt
+
+Internally, we use Cython to speed up json.gz processing (bigGzipJson.pyx), because the json.gz files can consume a lot of memory when loaded whole.
+
+If you want to use bigGzipJson outside of `buildnativeoperations.sh` and CMake, compile it manually with this command in the auto_vectorization folder:
+`python3 cython_setup.py build_ext --inplace`
+
+json.gz files can also be processed outside of `buildnativeoperations.sh`:
+run `python3 auto_vect.py --fsave` from the base source folder where the json.gz files are located.
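+
+For reference, both reports can also be generated by hand, assuming the build was configured with `-DCHECK_VECTORIZATION=ON` so the vectorization flags are applied. The sketch below is illustrative (it assumes a GCC CPU build tree under `blasbuild/cpu`): the basic report is produced by piping the compiler's vectorization notes into `auto_vect.py`, and the detailed report by scanning the json.gz files with `--fsave`:
+
+```bash
+cd blasbuild/cpu
+
+# basic report (GCC <= 9 path): pipe the compiler output into the script;
+# it writes vecmiss.html into the current directory
+make 2>&1 | python3 ../../auto_vectorization/auto_vect.py
+
+# detailed report (GCC > 9 path): scan for *.json.gz files produced by
+# -fsave-optimization-record and write vecmiss_fsave.html + vecmiss_fsave.html.js
+python3 ../../auto_vectorization/auto_vect.py --fsave
+```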
+ diff --git a/libnd4j/auto_vectorization/auto_vect.py b/libnd4j/auto_vectorization/auto_vect.py new file mode 100644 index 000000000..f98dc7422 --- /dev/null +++ b/libnd4j/auto_vectorization/auto_vect.py @@ -0,0 +1,546 @@ +''' +@author : Abdelrauf rauf@konduit.ai +''' +import re +import sys +import os +import subprocess +import fnmatch +import json +import gzip +try: + from bigGzipJson import json_gzip_extract_objects +except ImportError: + pass +from pathlib import Path +from multiprocessing import Pool, Manager ,cpu_count +import traceback +import html + +mtch = re.compile(r"[^/]*([^:]+)\:(\d+)\:(\d+)\:(.*)") +replace_msg = re.compile(r"(\d+)?\.?(\d+)?_?\d+\.?(\d+)?") +progress_msg = re.compile(r"\s{0,4}\[\s{0,2}\d+\%\]") +file_dir_strip = str(Path(os.getcwd())) +pp_index = file_dir_strip.rfind("libnd4j") +if pp_index>=0: + file_dir_strip =file_dir_strip[:pp_index+len("libnd4j")] +BASE_URL = "https://github.com/eclipse/deeplearning4j/tree/master/libnd4j/" +if BASE_URL.endswith("/")==False: + BASE_URL = BASE_URL + "/" +#print(file_dir_strip) +class info: + def __repr__(self): + return str(self.__dict__) + +FSAVE_IGNORE_EXTERNALS = True + +def get_cxx_filt_result(strx): + if len(strx)<1: + return "" + res = subprocess.Popen(["c++filt","-i", strx], stdout=subprocess.PIPE).communicate()[0] + res =res.decode('utf-8') + #replace some long names to reduce size + res = res.replace("unsigned long long", "uLL") + res = res.replace("unsigned long int","uL") + res = res.replace("unsigned long", "uL") + res = res.replace("unsigned int", "ui") + res = res.replace("unsigned char", "uchar") + res = res.replace("unsigned short", "ushort") + res = res.replace("long long", "LL") + res = res.replace(", ",",") + return res.strip() + + +def internal_glob(dir, match): + listx = [] + for root, dirnames, filenames in os.walk(dir): + for filename in fnmatch.filter(filenames, match): + listx.append(os.path.join(root, filename)) + return listx + +def get_obj_json_gz(filename): + with gzip.GzipFile(filename, 'r') as f: + return json.loads(f.read().decode('utf-8'))[-1] + + + +def get_msg(msg): + msg = msg.lower().strip() + if "note: not vectorized:" in msg: + msg = replace_msg.sub("_numb",msg.replace("note: not vectorized:","")) + return( 0, 1, msg.strip()) + elif "loop vectorized" in msg: + return (1, 0, None) + # elif msg.startswith("missed")==False: + # msg = replace_msg.sub("_numb",msg) + # return( 0, 0, msg.strip()) + return None + + + + +class File_Info: + ''' + Holds information about vectorized and miss vectorized lines for one file + ''' + + def __init__(self): + self.infos = {} + self.total_opted =0 + self.total_missed = 0 + self.external = False + + + def add_line(self, line_pos): + if line_pos not in self.infos: + v = info() + v.optimized = 0 + v.missed = 0 + v.miss_details = set() + self.infos[line_pos] = v + return v + else: + return self.infos[line_pos] + + + def add_line_fsave(self, line_pos): + if line_pos not in self.infos: + v = info() + v.optimized = 0 + v.missed = 0 + v.miss_details2 = dict() + self.infos[line_pos] = v + return v + else: + return self.infos[line_pos] + + + + def add_fsave(self, line_pos,success, msg, function ,inline_fns=''): + v = self.add_line_fsave(line_pos) + if success and "loop vectorized" in msg: + v.optimized +=1 + self.total_opted +=1 + elif success==False and "not vectorized:" in msg: + #reduce this msg + msg = msg.replace("not vectorized:","") + v.missed +=1 + self.total_missed +=1 + msg = sys.intern(msg) + if msg in v.miss_details2: + ls = v.miss_details2.get(msg) + 
ls.add(function) + else: + ls =set() + v.miss_details2[msg]=ls + ls.add(function) + return self + + def add(self, line_pos, msg_x): + v = self.add_line(line_pos) + if msg_x is not None: + v.optimized += msg_x[0] + v.missed += msg_x[1] + self.total_opted += msg_x[0] + self.total_missed += msg_x[1] + if msg_x[2] is not None: + v.miss_details.add(msg_x[2]) + return self + + + def __repr__(self): + return str(self.__dict__) + + + + +def process_gzip_json_mp(args): + process_gzip_json_new(*args) + +def process_gzip_json_new(json_gz_fname,list_Queue): + gz_name = Path(json_gz_fname).stem + #print("::--open and process {0}".format(gz_name)) + queue_count = len(list_Queue) + #print(queue_count) + q = list_Queue[0] + old_fname = '' + total_c = 0 + for x in json_gzip_extract_objects(json_gz_fname,'message','vectorized'): + external_source = True + if len(x['message'])>0 and 'location' in x: + line = int(x['location']['line']) + file_name = x['location']['file'].strip() + if file_dir_strip in file_name: + file_name = file_name.replace(file_dir_strip,'./') + external_source = False + msg = x['message'][0] + success = x['kind'] == 'success' + func = '' if 'function' not in x else x['function'] + + if file_name!=old_fname: + #send our info to the right consumer + queue_ind = hash(file_name) % queue_count + #print("quen index {0}".format(queue_ind)) + q =list_Queue[queue_ind] + old_fname = file_name + total_c +=1 + #print("pp {0} {1}".format(q,(file_name,line,success, msg, func,external_source ))) + if FSAVE_IGNORE_EXTERNALS==True and external_source == True: + continue + q.put((file_name,line,success, msg, func,external_source )) + print("::finished {0:60s} :{1:8d}".format(gz_name,total_c)) + +def consume_processed_mp(args): + return consume_processed_new(*args) + + + +def consume_processed_new(list_Queue , c_index): + + info_ = dict() + func_list = dict() + last_func_index = 0 + q = list_Queue[c_index] + print("::consumer {0}".format(c_index)) + total_c = 0 + r_c = 0 + while True: + #print("try to get new from {0}".format(index)) + obj = q.get() + #print("cc {0} {1}".format(q,obj)) + if obj==None: + break #we received the end + file_name,line,success, msg, func, external_source = obj + try: + #get function index + func_index = -1 + if func in func_list: + func_index = func_list[func] + else: + func_list[func] = last_func_index + func_index = last_func_index + last_func_index +=1 + + if file_name in info_: + info_[file_name].add_fsave(line, success, msg, func_index) + else: + info_[file_name] = File_Info().add_fsave(line, success, msg, func_index) + info_[file_name].external = external_source + total_c +=1 + if total_c - r_c >10000: + r_c = total_c + print("::consumer {0:2d} :{1:10d}".format(c_index,total_c)) + except Exception as e: + print(traceback.format_exc()) + break + + print("::consumer {0:2d} :{1:10d}".format(c_index,total_c)) + #write to temp file + wr_fname= "vecmiss_fsave{0}.html".format(str(c_index) if len(list_Queue)>1 else '') + print("generate report for consumer {0} {1}".format(c_index,len(info_))) + try: + uniq_ind = str(c_index)+'_' if len(list_Queue)>1 else '' + generate_report(wr_fname,info_ ,only_body = False, unique_id_prefix = uniq_ind,fsave_format = True, function_list= func_list) + print(" consumer {0} saved output into {1}".format(c_index,wr_fname)) + except Exception as e: + print(traceback.format_exc()) + + + +def obtain_info_from(input_): + info_ = dict() + for line in input_: + x = mtch.match(line) + external_source = True + if x: + file_name =x.group(1).strip() + if 
file_dir_strip in file_name: + file_name = file_name.replace(file_dir_strip,'') + external_source = False + line_number = int(x.group(2)) + msg = x.group(4).lower() + msg = msg.replace(file_dir_strip,'./') + msg_x = get_msg(msg) + if msg_x is None: + continue + if file_name in info_: + #ignore col_number + info_[file_name].add(line_number,msg_x) + else: + #print("{0} {1}".format(file_name,external_source)) + info_[file_name] = File_Info().add(line_number,msg_x) + info_[file_name].external = external_source + elif progress_msg.match(line): + #actually we redirect only, stderr so this should not happen + print("__"+line.strip()) + elif "error" in line or "Error" in line: + print("****"+line.strip()) + return info_ + + + +def custom_style(fsave): + st = '''''' + +def header(fsave=False): + strx ='\n\n\n\nAuto-Vectorization\n' + strx +=''.format(BASE_URL) + strx +=custom_style(fsave) + strx +='\n\n\n' + return strx + +def footer(): + return '\n' + + +def get_compressed_indices(set_a): + a_len = len(set_a) + if a_len<=1: + if a_len<1: + return '' + return str(set_a)[1:-1] + #we sorted and only saved difference + # 1,14,15,19 --> 1,13,1,4 10bytes=>8bytes + list_sorted = sorted(list(set_a)) + last = list_sorted[0] + str_x = str(list_sorted[0]) + for i in range(1,a_len): + str_x += ','+str(list_sorted[i]-last) + last = list_sorted[i] + return str_x + + + + + +def get_content(k, v, unique_id_prefix = '', fsave_format=False): + inner_str='' + content = '' + inc_id = 0 + for fk,fv in sorted(v.infos.items()): + if fsave_format==True: + inner_str+='
{1}
{2}
    '.format( + fk,fv.optimized,fv.missed,unique_id_prefix,inc_id) + else: + inner_str+='
    {2}
    {3}
      '.format( + k,fk,fv.optimized,fv.missed,unique_id_prefix,inc_id) + inc_id+=1 + if fsave_format==True: + # + for dt,df in fv.miss_details2.items(): + #inner_str +='
    • {1}
    • '.format(str(df).replace(", ",",")[1:-1],dt) + inner_str +='
    • {1}
    • '.format(get_compressed_indices(df),dt) + else: + for dt in fv.miss_details: + inner_str+="
    • "+str(dt)+ "
    • " + inner_str+="
    \n" + + content += '
    /g, ">") + .replace(/"/g, """) + .replace(/'/g, "'"); + } + for (i = 0; i < tags.length; i++) { + tags[i].addEventListener("click", function () { + var source = event.target || event.srcElement; + funcs = source.dataset.fns.split(",") + strx = '' + //we saved differences,not real indices + last_ind = 0; + for (j = 0; j < funcs.length; j++) { + ind = last_ind + parseInt(funcs[j]); + strx += "

    " + escapeHtml(func_list[ind]) + "

    "; + last_ind = ind; + } + if (strx.length > 0) { + content.innerHTML = strx; + modal.className = 'modal open'; + } + + }); + } + + };''' + +def additional_tags(fsave): + if fsave==False: + return '' + # + return ''' + + ''' + +def generate_report(output_name,info_ ,only_body = False, unique_id_prefix='',fsave_format = False , function_list = None): + ''' + Generate Auto-Vectorization Report in html format + ''' + + temp_str ='' + if fsave_format ==True: + # we gonna dump function_list as key list sorted by value + #and use it as jscript array + sorted_funcs_by_index = sorted(function_list.items(), key=lambda x: x[1]) + del function_list + with open(output_name+ ".js","w") as f: + #temp_str =jscript_head() +'{ "fmaps":[' + temp_str = jscript_head() + "\n var func_list = [" + for k,v in sorted_funcs_by_index: + #json.dumps using for escape + #print(str(v)+str(k)) + temp_str+=json.dumps(get_cxx_filt_result(k))+"," + #reduce write calls + if len(temp_str)>8192*2: + f.write(temp_str) + temp_str= '' + if len(temp_str)>0: + f.write(temp_str) + f.write('"-"];'+jscipt_end()) + + + temp_str = '' + with open(output_name,"w") as f: + if only_body==False: + f.write(header(fsave_format)) + f.write(additional_tags(fsave_format)) + nm=0 + for k,v in sorted(info_.items()): # sorted(info_.items(), key=lambda x: x[1].total_opted, reverse=True): + temp_str += get_content(k,v,unique_id_prefix+str(nm),fsave_format) + #reduce io write calls + if len(temp_str)>8192: + f.write(temp_str) + temp_str ='' + nm+=1 + if len(temp_str)>0: + f.write(temp_str) + if only_body==False: + f.write(footer()) + + +def fsave_report_launch(json_gz_list): + + cpus = cpu_count() + if cpus>32: + cpus = 24 + + c_count = 1 # 2 i sufficient # if cpus<=1 else min(4,cpus) + p_count = 3 if cpus<=1 else max(8, cpus - c_count) + + m = Manager() + #consumer Queues + list_Queue = [m.Queue() for index in range(0,c_count)] + with Pool(processes=c_count) as consumers: + #start consumers + cs = consumers.map_async(consume_processed_mp,[(list_Queue, index,) for index in range(0,c_count)]) + with Pool(processes=p_count) as processors: + processors.map(process_gzip_json_mp, [(fname, list_Queue,) for fname in json_gz_list]) + + #send ends to inform our consumers + #send ends + for q in list_Queue: + q.put(None) + + #wait for consumers + cs.wait() + + + + +def main(): + if "--fsave" in sys.argv: + json_gz_list = internal_glob(".","*.json.gz") + fsave_report_launch(json_gz_list) + return + + file_info = obtain_info_from(sys.stdin) + if len(file_info)>0: + #print(file_info) + print("---generating vectorization html report--") + generate_report("vecmiss.html", file_info) + else: + # lets check if we got fsave files + json_gz_list = internal_glob(".","*.json.gz") + fsave_report_launch(json_gz_list) + + + + +if __name__ == '__main__': + main() diff --git a/libnd4j/auto_vectorization/bigGzipJson.pyx b/libnd4j/auto_vectorization/bigGzipJson.pyx new file mode 100644 index 000000000..277bd16ec --- /dev/null +++ b/libnd4j/auto_vectorization/bigGzipJson.pyx @@ -0,0 +1,354 @@ +''' +@author : Abdelrauf rauf@konduit.ai +Simple object xtractor form very big json files +''' + +import sys +from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free + + +cdef char JSON_1 = b':' +cdef char JSON_2 = b',' +cdef char JSON_3 = b'{' +cdef char JSON_4 = b'}' +cdef char JSON_5 = b'[' +cdef char JSON_6 = b']' +cdef char QUOTE = b'"' +cdef char ESCAPE = b"\\" +cdef char SPACE = b' ' +cdef char TAB = b't' +cdef char CR = b'\r' +cdef char NL = b'\n' +cdef char B = b'\b' 
+cdef char EMPTY = b'\0' + + +cdef struct Span: + int b + int e + +cdef inline Span read_unquoted(char *text, int start,int end): + cdef Span sp + cdef int j = start + while j < end: + #if text[j].isspace(): + if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B: + j += 1 + continue + if text[j] != QUOTE and text[j] != JSON_1 and text[j] != JSON_2 and text[j] != JSON_3 and text[j] != JSON_4 and text[j] != JSON_5 and text[j] != JSON_6: + start = j + j += 1 + while j < end: + # read till JSON or white space + if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B: + sp.b = start + sp.e = j + return sp + elif text[j] == JSON_1 or text[j] == JSON_2 or text[j] == JSON_3 or text[j] == JSON_4 or text[j] == JSON_5 or text[j] == JSON_6: + sp.b = start + sp.e = j + return sp + j += 1 + if j == end-1: + sp.b = start + sp.e = end + return sp + break + sp.b = j + sp.e = j + return sp + + +cdef inline Span read_seq_token(char *text,int start,int end): + #read quoted + #skip white_space + cdef Span sp + cdef int j = start + cdef char last_char + cdef char char_x + while j < end: + if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B: + j += 1 + continue + if text[j] == QUOTE: + last_char = EMPTY + #read till another quote + start = j + j += 1 + while j < end: + char_x = text[j] + if char_x == QUOTE and last_char != ESCAPE: + # finished reading + sp.b =start + sp.e = j+1 + return sp + last_char = char_x + j += 1 + if j == end-1: + sp.b = start + sp.e = end + return sp + else: + break + return read_unquoted(text, j, end) + + +def tokenizer_spans(utext): + ''' + we will just return tokenize spans + ''' + token_spans = [] + last_char = b'' + end_i = len(utext) + cdef char *text = utext + i = 0 + cdef Span sp + while i < end_i: + sp = read_seq_token(text, i, end_i) + i = sp.e + if sp.e > sp.b: + token_spans.append((sp.b, sp.e)) + if i < end_i: + #if text[i] in JSON: + if text[i] == JSON_3 or text[i] == JSON_4 or text[i] == JSON_5 or text[i] == JSON_6 or text[i] == JSON_1 or text[i] == JSON_2: + token_spans.append((i, i+1)) + i += 1 + return token_spans + + + + + +cdef class JsonObjXtractor: + ''' + JsonObjXtractor that utilize cython better + ''' + + cdef Span* token_spans + cdef size_t size + + def __cinit__(self, size_t count=4096): + self.token_spans = PyMem_Malloc(count * sizeof(Span)) + self.size = count + if not self.token_spans: + raise MemoryError() + + + def __tokenizer_spans(self,utext, length): + ''' + we will just return token spans length + ''' + + last_char = b'' + end_i = length + cdef char *text = utext + cdef int i = 0 + cdef size_t j = 0 + cdef Span sp + while i < end_i: + sp = read_seq_token(text, i, end_i) + i = sp.e + if sp.e > sp.b: + self.token_spans[j] = sp + j+=1 + if j>self.size: + #we need to reallocate + self.__resize(self.size+self.size//2) + if i < end_i: + #if text[i] in JSON: + if text[i] == JSON_3 or text[i] == JSON_4 or text[i] == JSON_5 or text[i] == JSON_6 or text[i] == JSON_1 or text[i] == JSON_2: + sp.b=i + sp.e=i+1 + self.token_spans[j] = sp + j+=1 + if j>self.size: + #we need to reallocate + self.__resize(self.size+self.size//2) + i += 1 + return j + + + + def try_extract_parent_obj(self, json_bytes, property_name, next_contains_value=b'', debug=False): + ''' + try_extract_parent_obj(json_text, property_name, next_contains_value='', debug=False): + make sure that passed variables encoded to bytes with encode('utf-8') + next_contains_value either direct content or 
followed by '[' + tries to extract the parent object for given named object + if the left brace of the parent object is outside of the current buffer + it will be ignored + if the right brace is outside of the buffer it will be left to be handled by caller + ''' + + look_for_the_left = True + parent_left = [] + parent_right = [] + parent_objects = [] + len_next = len(next_contains_value) + cdef int ind = 0 + cdef int end + cdef int last_start = 0 + property_name = b'"'+property_name+b'"' + cdef int lenx = self.__tokenizer_spans(json_bytes,len(json_bytes)) + cdef char x + cdef int i = -1 + cdef Span sp + while i < lenx-1: + i += 1 + ind = self.token_spans[i].b + x = json_bytes[ind] + #print("-----{0} -- {1} -- {2} ".format(x,parent_left,parent_right)) + if look_for_the_left == False: + if x == JSON_3: + parent_right.append(ind) + elif x == JSON_4: + if len(parent_right) == 0: + #we found parent closing brace + look_for_the_left = True + parent_objects.append((parent_left[-1], ind+1)) + last_start = ind+1 + #print("=============found {0}".format(parent_objects)) + parent_left = [] + parent_right = [] + else: + parent_right.pop() + continue + #search obj + if look_for_the_left: + if x == JSON_3: + parent_left.append(ind) + last_start = ind + elif x == JSON_4: + if len(parent_left) >= 1: + #ignore + parent_left.pop() + + if x == JSON_1: # ':' + #check to see if propertyname + old_property = EMPTY + if i > 1: + sp = self.token_spans[i-1] + old_property = json_bytes[sp.b:sp.e] + if old_property == property_name: + #we found + if len(parent_left) < 1: + #left brace is outside of the buffer + #we have to ignore it + #try to increase buffer + if debug: + print('''left brace of the parent is outside of the buffer and parent is big. + it will be ignored + try to choose disambiguous property names if you are looking for small objects''', file=sys.stderr) + last_start = ind+1 + parent_left = [] + parent_right = [] + continue + else: + #print("++++++ look for the right brace") + if len_next>0 and i+1 < lenx: + i += 1 + ind = self.token_spans[i].b + end = self.token_spans[i].e + m = json_bytes[ind] + + if m == JSON_5: + #print ("----{0} {1}".format(m,JSON_5)) + if i+1 < lenx: + i += 1 + ind = self.token_spans[i].b + end = self.token_spans[i].e + #print ("----{0} == {1}".format(next_contains_value,json_bytes[ind:end])) + if len_next <= end-ind and next_contains_value in json_bytes[ind:end]: + look_for_the_left = False + continue + elif len_next <= end-ind and next_contains_value in json_bytes[ind:end]: + look_for_the_left = False + continue + + #ignore as it does not have that value + parent_left = [] + parent_right = [] + last_start = ind + 1 + else: + look_for_the_left = False + + # lets return last succesful opened brace as the last + # or left brace failure case, safe closed brace + if len(parent_left)>0: + return (parent_objects, parent_left[-1]) + + return (parent_objects, last_start) + + + + def __resize(self, size_t new_count): + cdef Span* mem = PyMem_Realloc(self.token_spans, new_count * sizeof(Span)) + if not mem: + raise MemoryError() + self.token_spans = mem + self.size = new_count + + def __dealloc__(self): + PyMem_Free(self.token_spans) + + + +import json +import gzip +import sys +DEBUG_LOG = False + +def json_gzip_extract_objects(filename, property_name, next_contains_value=''): + strx = b'' + started= False + b_next_contains_value = next_contains_value.encode('utf-8') + b_property_name = property_name.encode('utf-8') + #print(b_property_name) + objXt = JsonObjXtractor() + with 
gzip.open(filename, 'rb') as f: + if DEBUG_LOG: + print("opened {0}".format(filename), file=sys.stderr) + #instead of reading it as line, I will read it as binary bytes + is_End = False + #total = 0 + while is_End==False: + buffer = f.read(8192*2) + + lenx= len(buffer) + #total +=lenx + if lenx<1: + is_End = True + else: + strx = strx + buffer + + objects , last_index = objXt.try_extract_parent_obj(strx,b_property_name,b_next_contains_value) + + # if b_property_name in strx and b_next_contains_value in strx: + # print(strx) + # print(objects) + # print(last_index) + # print("===================================================") + + for start,end in objects: + yield json.loads(strx[start:end]) #.decode('utf-8')) + + + #remove processed + if last_index< len(strx): + strx = strx[last_index:] + + else: + strx = b'' + #print('----+++') + + if(len(strx)>16384*3): + #buffer to big + #try to avoid big parents + if DEBUG_LOG: + print("parent object is too big. please, look for better property name", file=sys.stderr) + + break + + + + diff --git a/libnd4j/auto_vectorization/cython_setup.py b/libnd4j/auto_vectorization/cython_setup.py new file mode 100644 index 000000000..9dc6ef0c1 --- /dev/null +++ b/libnd4j/auto_vectorization/cython_setup.py @@ -0,0 +1,3 @@ +from distutils.core import setup +from Cython.Build import cythonize +setup(ext_modules=cythonize("bigGzipJson.pyx", language_level="3")) diff --git a/libnd4j/blas/CMakeLists.txt b/libnd4j/blas/CMakeLists.txt index c1c5de399..a54ad52b4 100755 --- a/libnd4j/blas/CMakeLists.txt +++ b/libnd4j/blas/CMakeLists.txt @@ -282,6 +282,32 @@ elseif(CPU_BLAS) set_source_files_properties(../include/helpers/impl/OpTracker.cpp PROPERTIES COMPILE_FLAGS "-march=x86-64 -mtune=generic") endif() + if(CHECK_VECTORIZATION) + set(VECT_FILES cpu/NativeOps.cpp ${OPS_SOURCES} ${HELPERS_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} ${LOOPS_SOURCES}) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + + if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) + set(CHECK_VECT_FLAGS "-ftree-vectorize -fsave-optimization-record") + #to process fsave-optimization-record we will need our cython version code + message("Build Auto vectorization helpers") + execute_process(COMMAND "python3" "${CMAKE_CURRENT_SOURCE_DIR}/../auto_vectorization/cython_setup.py" "build_ext" "--inplace" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/../auto_vectorization/" RESULT_VARIABLE ret) + message("build='${ret}'") + + #remove fail cases that gcc fails produce sometimes + file(GLOB_RECURSE FAILURE_CASES false ../include/loops/cpu/compilation_units/reduce3*.cpp) + #message("*****${FAILURE_CASES}") + foreach(FL_ITEM ${FAILURE_CASES}) + message("Removing failure cases ${FL_ITEM}") + list(REMOVE_ITEM VECT_FILES ${FL_ITEM}) + endforeach() + else() + set(CHECK_VECT_FLAGS "-ftree-vectorize -fopt-info-vec-optimized-missed") + endif() + message("CHECK VECTORIZATION ${CHECK_VECT_FLAGS}") + set_source_files_properties( ${VECT_FILES} PROPERTIES COMPILE_FLAGS "${CHECK_VECT_FLAGS}" ) + endif() + endif() + message("CPU BLAS") add_definitions(-D__CPUBLAS__=true) add_library(nd4jobj OBJECT cpu/NativeOps.cpp cpu/GraphExecutioner.cpp diff --git a/libnd4j/buildnativeoperations.sh b/libnd4j/buildnativeoperations.sh index a8b45e918..c07756d8c 100755 --- a/libnd4j/buildnativeoperations.sh +++ b/libnd4j/buildnativeoperations.sh @@ -55,6 +55,7 @@ TESTS="false" VERBOSE="false" VERBOSE_ARG="VERBOSE=1" HELPER= +CHECK_VECTORIZATION="OFF" NAME= while [[ $# > 0 ]] do @@ -114,6 +115,9 @@ case $key in NAME="$value" 
shift # past argument ;; + --check-vectorization) + CHECK_VECTORIZATION="ON" + ;; -j) MAKEJ="$value" shift # past argument @@ -528,14 +532,27 @@ echo MINIFIER = "${MINIFIER_ARG}" echo TESTS = "${TESTS_ARG}" echo NAME = "${NAME_ARG}" echo OPENBLAS_PATH = "$OPENBLAS_PATH" +echo CHECK_VECTORIZATION = "$CHECK_VECTORIZATION" echo HELPERS = "$HELPERS" mkbuilddir pwd -eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" $HELPERS "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../.. +eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" -DCHECK_VECTORIZATION="${CHECK_VECTORIZATION}" $HELPERS "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../.. + if [ "$PARALLEL" == "true" ]; then MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ" fi if [ "$VERBOSE" == "true" ]; then MAKE_ARGUMENTS="$MAKE_ARGUMENTS $VERBOSE_ARG" fi -eval $MAKE_COMMAND $MAKE_ARGUMENTS && cd ../../.. + +if [ "$CHECK_VECTORIZATION" == "ON" ]; then + +if [ "$MAKE_COMMAND" == "make" ]; then + MAKE_ARGUMENTS="$MAKE_ARGUMENTS --output-sync=target" +fi +exec 3>&1 +eval $MAKE_COMMAND $MAKE_ARGUMENTS 2>&1 >&3 3>&- | python3 ../../auto_vectorization/auto_vect.py && cd ../../.. +exec 3>&- +else +eval $MAKE_COMMAND $MAKE_ARGUMENTS && cd ../../.. +fi
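
The redirection in the `--check-vectorization` branch above pipes only the compiler's stderr (where GCC emits its `-fopt-info` vectorization notes) into `auto_vect.py`, while regular make output still reaches the terminal through file descriptor 3. A minimal standalone sketch of the same idiom (the make invocation and script path are illustrative):

```bash
exec 3>&1                                   # fd 3 duplicates the terminal's stdout
make 2>&1 >&3 3>&- | python3 auto_vect.py   # stderr -> pipe, stdout -> terminal via fd 3
exec 3>&-                                   # close fd 3 again
```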