auto-vectorization check for gcc (#172)

* Autovectorization tool:
  - sync output for gnu make
  - Reduced html output
  - links for line numbers
  - AutoVectorization.md

  Signed-off-by: AbdelRauf <rauf@konduit.ai>

* Detailed report with `-fsave-optimization-record` option

  Signed-off-by: AbdelRauf <rauf@konduit.ai>

* Readme

  Signed-off-by: AbdelRauf <rauf@konduit.ai>

Co-authored-by: raver119 <raver119@gmail.com>

parent 7a7ee4b021
commit f25056363b
@@ -5,7 +5,7 @@ option(NATIVE "Optimize for build machine (might not work on others)" OFF)
set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
#ensure we create lib files
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF)

option(CHECK_VECTORIZATION "checks for vectorization" OFF)
option(BUILD_TESTS "Build tests" OFF)
option(FLATBUFFERS_BUILD_FLATC "Enable the build of the flatbuffers compiler" OFF)
set(FLATBUFFERS_BUILD_FLATC "OFF" CACHE STRING "Hack to disable flatc build" FORCE)
@@ -17,8 +17,11 @@ There are a few additional arguments for the `buildnativeoperations.sh` script you could use:

-b release OR -b debug // enables/disables debug builds. release is considered by default
-j XX // this argument defines how many threads will be used to build binaries on your box. i.e. -j 8
-cc XX // CUDA-only argument, builds only binaries for the target GPU architecture. use this for fast builds
--check-vectorization auto-vectorization report for developers. (Currently, only GCC is supported)
```

[More about the AutoVectorization report](auto_vectorization/AutoVectorization.md)

You can find the compute capability for your card [on the NVIDIA website here](https://developer.nvidia.com/cuda-gpus).

For example, a GTX 1080 has compute capability 6.1, for which you would use ```-cc 61``` (note no decimal point).
@@ -0,0 +1,49 @@
# Auto-vectorization Report

This report tool is used to get a human-friendly compiler output of the auto-vectorization process. It is intended to help developers investigate the obstacles the compiler faced during auto-vectorization.

## Usage
The ```--check-vectorization``` option should be added to the **release** build to get the auto-vectorization report:
```./buildnativeoperations.sh -a native -j 28 --check-vectorization```
It will output ```vecmiss.html``` inside the blasbuild/cpu folder.
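Under the hood, `buildnativeoperations.sh` pipes the compiler diagnostics into `auto_vect.py`, which reads them from stdin. The sketch below shows the same flow done by hand, assuming `auto_vect.py` is importable and you captured the diagnostics into a file (`build.log` is a hypothetical name):

```python
# Sketch: generate vecmiss.html from a captured GCC -fopt-info log,
# reusing the helpers that auto_vect.py's main() calls (build.log is hypothetical).
from auto_vect import obtain_info_from, generate_report

with open("build.log") as f:
    file_info = obtain_info_from(f)

if len(file_info) > 0:
    generate_report("vecmiss.html", file_info)
```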
## Report Format
Each file name row contains info about the optimization attempts for that file's source code lines.
Each line number is also expandable (⇲) and contains the distinct failure notes.
It is possible to click on the line number to see the source code.

| file name | total successful attempts | total failed attempts | ⇲ |
|---|---|---|---|
| line number | successful attempts | failed attempts | ⇲ |
| - failure reasons | | | |
| line number | successful attempts | failed attempts | ⇲ |
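For orientation, this sketch renders a single report entry with the helpers from `auto_vect.py` (the file name, line number, and counts are made up for illustration):

```python
# Sketch: build one File_Info entry and emit its collapsible HTML block.
# add() takes (line_number, (successful, failed, failure_note)).
from auto_vect import File_Info, get_content

fi = File_Info().add(120, (1, 2, "not vectorized: complicated access pattern"))
print(get_content("./some/file.cpp", fi))
```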
##### Requirements
- GCC (Currently, only GCC is supported)
- python3

### Detailed report with the `-fsave-optimization-record` option:
If you want to get more detailed information (for now, it reports the functions where failures occurred) you should use a newer toolchain (GCC > 9), as newer GCC versions have the `-fsave-optimization-record` option.
When run through `buildnativeoperations.sh`, CMake will detect it and switch to the more detailed version.
Please note that this option is still experimental, so the compiler can fail to output some json.gz files with an error.
In that case, try to exclude those files from the build.
Also, the internal structure of the `-fsave-optimization-record` json.gz output can change in the future.

It outputs two files, **vecmiss_fsave.html** and **vecmiss_fsave.html.js**. To see the report details you need JavaScript enabled in your browser.

##### Requirements for the Detailed report
- GCC version > 9
- python3
- Cython (python3)
- json (python3)
- gzip (python3)
- c++filt

Internally, we use Cython to speed up json.gz file processing (bigGzipJson.pyx), because json.gz files can take a lot of memory when loaded whole.

If you want to use bigGzipJson outside of `buildnativeoperations.sh` and CMake, you should compile it manually using this command in the auto_vectorization folder:
`python3 cython_setup.py build_ext --inplace`

json.gz files can also be processed outside of `buildnativeoperations.sh`.
You need to call `python3 auto_vect.py --fsave` inside the base source folder where the json.gz files exist.
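If you only want to inspect the extracted optimization records yourself, the sketch below iterates them directly with `json_gzip_extract_objects` from the compiled `bigGzipJson` module, using the same ('message', 'vectorized') filter that `auto_vect.py` uses (the file name is hypothetical):

```python
# Sketch: stream optimization records out of a GCC json.gz file
# (somefile.json.gz is a hypothetical name).
from bigGzipJson import json_gzip_extract_objects

for obj in json_gzip_extract_objects("somefile.json.gz", "message", "vectorized"):
    loc = obj.get("location", {})
    print(obj.get("kind"), loc.get("file"), loc.get("line"))
```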
@@ -0,0 +1,546 @@
'''
@author : Abdelrauf rauf@konduit.ai
'''
import re
import sys
import os
import subprocess
import fnmatch
import json
import gzip
try:
    from bigGzipJson import json_gzip_extract_objects
except ImportError:
    pass
from pathlib import Path
from multiprocessing import Pool, Manager, cpu_count
import traceback
import html

mtch = re.compile(r"[^/]*([^:]+)\:(\d+)\:(\d+)\:(.*)")
replace_msg = re.compile(r"(\d+)?\.?(\d+)?_?\d+\.?(\d+)?")
progress_msg = re.compile(r"\s{0,4}\[\s{0,2}\d+\%\]")
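# Note: mtch parses "file:line:col: message" diagnostics from GCC's -fopt-info output;
# replace_msg collapses numeric literals in messages (to "_numb") so similar notes deduplicate;
# progress_msg recognizes gnu make's "[ NN%]" progress lines.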
file_dir_strip = str(Path(os.getcwd()))
pp_index = file_dir_strip.rfind("libnd4j")
if pp_index >= 0:
    file_dir_strip = file_dir_strip[:pp_index + len("libnd4j")]
BASE_URL = "https://github.com/eclipse/deeplearning4j/tree/master/libnd4j/"
if BASE_URL.endswith("/") == False:
    BASE_URL = BASE_URL + "/"
#print(file_dir_strip)


class info:
    def __repr__(self):
        return str(self.__dict__)


FSAVE_IGNORE_EXTERNALS = True


def get_cxx_filt_result(strx):
    if len(strx) < 1:
        return ""
    res = subprocess.Popen(["c++filt", "-i", strx], stdout=subprocess.PIPE).communicate()[0]
    res = res.decode('utf-8')
    #replace some long names to reduce size
    res = res.replace("unsigned long long", "uLL")
    res = res.replace("unsigned long int", "uL")
    res = res.replace("unsigned long", "uL")
    res = res.replace("unsigned int", "ui")
    res = res.replace("unsigned char", "uchar")
    res = res.replace("unsigned short", "ushort")
    res = res.replace("long long", "LL")
    res = res.replace(", ", ",")
    return res.strip()


def internal_glob(dir, match):
    listx = []
    for root, dirnames, filenames in os.walk(dir):
        for filename in fnmatch.filter(filenames, match):
            listx.append(os.path.join(root, filename))
    return listx


def get_obj_json_gz(filename):
    with gzip.GzipFile(filename, 'r') as f:
        return json.loads(f.read().decode('utf-8'))[-1]


def get_msg(msg):
    msg = msg.lower().strip()
    if "note: not vectorized:" in msg:
        msg = replace_msg.sub("_numb", msg.replace("note: not vectorized:", ""))
        return (0, 1, msg.strip())
    elif "loop vectorized" in msg:
        return (1, 0, None)
    # elif msg.startswith("missed")==False:
    #     msg = replace_msg.sub("_numb",msg)
    #     return (0, 0, msg.strip())
    return None


class File_Info:
    '''
    Holds information about vectorized and missed-vectorization lines for one file
    '''

    def __init__(self):
        self.infos = {}
        self.total_opted = 0
        self.total_missed = 0
        self.external = False

    def add_line(self, line_pos):
        if line_pos not in self.infos:
            v = info()
            v.optimized = 0
            v.missed = 0
            v.miss_details = set()
            self.infos[line_pos] = v
            return v
        else:
            return self.infos[line_pos]

    def add_line_fsave(self, line_pos):
        if line_pos not in self.infos:
            v = info()
            v.optimized = 0
            v.missed = 0
            v.miss_details2 = dict()
            self.infos[line_pos] = v
            return v
        else:
            return self.infos[line_pos]

    def add_fsave(self, line_pos, success, msg, function, inline_fns=''):
        v = self.add_line_fsave(line_pos)
        if success and "loop vectorized" in msg:
            v.optimized += 1
            self.total_opted += 1
        elif success == False and "not vectorized:" in msg:
            #reduce this msg
            msg = msg.replace("not vectorized:", "")
            v.missed += 1
            self.total_missed += 1
            msg = sys.intern(msg)
            if msg in v.miss_details2:
                ls = v.miss_details2.get(msg)
                ls.add(function)
            else:
                ls = set()
                v.miss_details2[msg] = ls
                ls.add(function)
        return self

    def add(self, line_pos, msg_x):
        v = self.add_line(line_pos)
        if msg_x is not None:
            v.optimized += msg_x[0]
            v.missed += msg_x[1]
            self.total_opted += msg_x[0]
            self.total_missed += msg_x[1]
            if msg_x[2] is not None:
                v.miss_details.add(msg_x[2])
        return self

    def __repr__(self):
        return str(self.__dict__)


def process_gzip_json_mp(args):
    process_gzip_json_new(*args)


def process_gzip_json_new(json_gz_fname, list_Queue):
    gz_name = Path(json_gz_fname).stem
    #print("::--open and process {0}".format(gz_name))
    queue_count = len(list_Queue)
    q = list_Queue[0]
    old_fname = ''
    total_c = 0
    for x in json_gzip_extract_objects(json_gz_fname, 'message', 'vectorized'):
        external_source = True
        if len(x['message']) > 0 and 'location' in x:
            line = int(x['location']['line'])
            file_name = x['location']['file'].strip()
            if file_dir_strip in file_name:
                file_name = file_name.replace(file_dir_strip, './')
                external_source = False
            msg = x['message'][0]
            success = x['kind'] == 'success'
            func = '' if 'function' not in x else x['function']

            if file_name != old_fname:
                #send our info to the right consumer
                queue_ind = hash(file_name) % queue_count
                q = list_Queue[queue_ind]
                old_fname = file_name
            total_c += 1
            if FSAVE_IGNORE_EXTERNALS == True and external_source == True:
                continue
            q.put((file_name, line, success, msg, func, external_source))
    print("::finished {0:60s} :{1:8d}".format(gz_name, total_c))


def consume_processed_mp(args):
    return consume_processed_new(*args)


def consume_processed_new(list_Queue, c_index):

    info_ = dict()
    func_list = dict()
    last_func_index = 0
    q = list_Queue[c_index]
    print("::consumer {0}".format(c_index))
    total_c = 0
    r_c = 0
    while True:
        obj = q.get()
        if obj == None:
            break  #we received the end
        file_name, line, success, msg, func, external_source = obj
        try:
            #get function index
            func_index = -1
            if func in func_list:
                func_index = func_list[func]
            else:
                func_list[func] = last_func_index
                func_index = last_func_index
                last_func_index += 1

            if file_name in info_:
                info_[file_name].add_fsave(line, success, msg, func_index)
            else:
                info_[file_name] = File_Info().add_fsave(line, success, msg, func_index)
                info_[file_name].external = external_source
            total_c += 1
            if total_c - r_c > 10000:
                r_c = total_c
                print("::consumer {0:2d} :{1:10d}".format(c_index, total_c))
        except Exception as e:
            print(traceback.format_exc())
            break

    print("::consumer {0:2d} :{1:10d}".format(c_index, total_c))
    #write to temp file
    wr_fname = "vecmiss_fsave{0}.html".format(str(c_index) if len(list_Queue) > 1 else '')
    print("generate report for consumer {0} {1}".format(c_index, len(info_)))
    try:
        uniq_ind = str(c_index) + '_' if len(list_Queue) > 1 else ''
        generate_report(wr_fname, info_, only_body=False, unique_id_prefix=uniq_ind, fsave_format=True, function_list=func_list)
        print(" consumer {0} saved output into {1}".format(c_index, wr_fname))
    except Exception as e:
        print(traceback.format_exc())


def obtain_info_from(input_):
    info_ = dict()
    for line in input_:
        x = mtch.match(line)
        external_source = True
        if x:
            file_name = x.group(1).strip()
            if file_dir_strip in file_name:
                file_name = file_name.replace(file_dir_strip, '')
                external_source = False
            line_number = int(x.group(2))
            msg = x.group(4).lower()
            msg = msg.replace(file_dir_strip, './')
            msg_x = get_msg(msg)
            if msg_x is None:
                continue
            if file_name in info_:
                #ignore col_number
                info_[file_name].add(line_number, msg_x)
            else:
                info_[file_name] = File_Info().add(line_number, msg_x)
                info_[file_name].external = external_source
        elif progress_msg.match(line):
            #actually we redirect only stderr, so this should not happen
            print("__" + line.strip())
        elif "error" in line or "Error" in line:
            print("****" + line.strip())
    return info_


def custom_style(fsave):
    st = '''<style>a{color:blue;}
a:link{text-decoration:none}a:visited{text-decoration:none}a:hover{cursor:pointer;text-decoration:underline}
a:active{text-decoration:underline}
.f.ext{display:none}
.f{color:#000;display:flex;overflow:hidden;justify-content:space-between;flex-wrap:wrap;align-items:baseline;width:100%}
.f>div{min-width:10%}.f>div:first-child{min-width:70%;text-overflow:ellipsis}
.f:nth-of-type(even){background-color:#f5f5f5}
.f>div.g{flex:0 0 100%}.f>div:nth-child(2){font-weight:600;color:green}
.f>div:nth-child(3){font-weight:600;color:red}
.f>div:nth-child(2)::after{content:' ✓';color:green}.f>div:nth-child(3)::after{content:' -';color:red}
.f>div.g>div>div:nth-child(2){font-weight:600;color:green}
.f>div.g>div>div:nth-child(3){font-weight:600;color:red}
.f>div.g>div>div:nth-child(2)::after{content:' ✓';color:green}
.f>div.g>div>div:nth-child(3)::after{content:' -';color:red}
.f>div.g>div{display:flex;justify-content:space-between;flex-wrap:wrap;align-items:baseline}
.f>div.g>div>div{min-width:10%;text-align:left}
.g>div:nth-of-type(even){background-color:#ede6fa}
.f>div.g>div>ul{flex:0 0 100%}input[type=checkbox]{opacity:0;display:none}label{cursor:pointer}
.f>label{color:red}input[type=checkbox]~.g{display:none}input[type=checkbox]:checked~.g{display:block}
input[type=checkbox]~ul{display:none}
input[type=checkbox]:checked~ul{display:block}input[type=checkbox]+label::after{content:"⇲";display:block}
input[type=checkbox]:checked+label::after{content:"⇱";display:block}
'''
    if fsave == True:
        st += '''.modal{display:none;height:100%;background-color:#144F84;color:#fff;opacity:.93;left:0;position:fixed;top:0;width:100%}
.modal.open{display:flex;flex-direction:column}.modal__header{height:auto;font-size:large;padding:10px;background-color:#000;color:#fff}
.modal__footer{height:auto;font-size:medium;background-color:#000}
.modal__content{height:100%;display:flex;flex-direction:column;padding:20px;overflow-y:auto}
.modal_close{cursor:pointer;float:right}li{cursor:pointer}
'''
    return st + '''</style>'''


def header(fsave=False):
    strx = '<!DOCTYPE html>\n<html>\n<head>\n<meta charset="UTF-8">\n<title>Auto-Vectorization</title>\n'
    strx += '<base id="base_id" href="{0}" target="_blank" >'.format(BASE_URL)
    strx += custom_style(fsave)
    strx += '\n</head>\n<body>\n'
    return strx


def footer():
    return '\n</body></html>'


def get_compressed_indices(set_a):
    a_len = len(set_a)
    if a_len <= 1:
        if a_len < 1:
            return ''
        return str(set_a)[1:-1]
    #we sort and save only the differences
    # 1,14,15,19 --> 1,13,1,4  10 bytes => 8 bytes
    list_sorted = sorted(list(set_a))
    last = list_sorted[0]
    str_x = str(list_sorted[0])
    for i in range(1, a_len):
        str_x += ',' + str(list_sorted[i] - last)
        last = list_sorted[i]
    return str_x
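# Example (values taken from the comment above): the set {1, 14, 15, 19} is emitted
# as "1,13,1,4"; the javascript in jscipt_end() re-adds the deltas to recover the
# real function indices on the client side.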
def get_content(k, v, unique_id_prefix='', fsave_format=False):
    inner_str = ''
    content = ''
    inc_id = 0
    for fk, fv in sorted(v.infos.items()):
        if fsave_format == True:
            inner_str += '<div><div><a>{0}</a></div><div>{1}</div><div>{2}</div><input type="checkbox" id="c{3}{4}"><label for="c{3}{4}"></label><ul>'.format(
                fk, fv.optimized, fv.missed, unique_id_prefix, inc_id)
        else:
            inner_str += '<div><div><a href=".{0}#L{1}">{1}</a></div><div>{2}</div><div>{3}</div><input type="checkbox" id="c{4}{5}"><label for="c{4}{5}"></label><ul>'.format(
                k, fk, fv.optimized, fv.missed, unique_id_prefix, inc_id)
        inc_id += 1
        if fsave_format == True:
            for dt, df in fv.miss_details2.items():
                #inner_str += '<li data-fns="{0}">{1}</li>'.format(str(df).replace(", ",",")[1:-1],dt)
                inner_str += '<li data-fns="{0}">{1}</li>'.format(get_compressed_indices(df), dt)
        else:
            for dt in fv.miss_details:
                inner_str += "<li>" + str(dt) + "</li>"
        inner_str += "</ul></div>\n"

    content += '<div class="f'
    if v.external:
        content += " ext"
    content += '">\n<div>{0}</div><div>{1}</div><div>{2}</div><input type="checkbox" id="i{3}{4}"><label for="i{3}{4}"></label>'.format(
        k, v.total_opted, v.total_missed, unique_id_prefix, inc_id)
    content += "<div class='g'>"
    content += inner_str
    content += "</div> </div>\n"
    return content


def jscript_head():
    return '''
window.onload = function () {
    var modal = document.getElementsByClassName("modal")[0];
    var modal_close = document.getElementsByClassName("modal_close")[0];
    var content = document.getElementsByClassName("modal__content")[0];
    a_tags = document.getElementsByTagName("a");
    base_href = document.getElementById("base_id").href;
    for(i=0;i<a_tags.length;i++){
        a_tags[i].addEventListener("click", function () {
            var source = event.target || event.srcElement;
            file_src = source.parentElement.parentElement.parentElement.parentElement.children[0].innerText;
            link = base_href + file_src + '#L' + source.innerText;
            window.open(link, '_blank');
        });
    }
    modal_close.addEventListener("click", function () {
        content.innerHTML = '';
        modal.className = 'modal';
    });
'''


def jscipt_end():
    return '''
    tags = document.getElementsByTagName("li");
    function escapeHtml(unsafe) {
        return unsafe
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&#039;");
    }
    for (i = 0; i < tags.length; i++) {
        tags[i].addEventListener("click", function () {
            var source = event.target || event.srcElement;
            funcs = source.dataset.fns.split(",")
            strx = ''
            //we saved differences, not real indices
            last_ind = 0;
            for (j = 0; j < funcs.length; j++) {
                ind = last_ind + parseInt(funcs[j]);
                strx += "<p>" + escapeHtml(func_list[ind]) + "</p>";
                last_ind = ind;
            }
            if (strx.length > 0) {
                content.innerHTML = strx;
                modal.className = 'modal open';
            }
        });
    }
};'''


def additional_tags(fsave):
    if fsave == False:
        return ''
    return '''<script type='text/javascript'>
var script = document.createElement('script'); script.src = window.location.href + ".js";
document.head.appendChild(script);
</script>
<div class="modal">
<div class="modal__header">Functions <span class="modal_close">X</span></div>
<div class="modal__content"></div>
<div class="modal__footer">========</div>
</div>
'''


def generate_report(output_name, info_, only_body=False, unique_id_prefix='', fsave_format=False, function_list=None):
    '''
    Generate the Auto-Vectorization report in html format
    '''

    temp_str = ''
    if fsave_format == True:
        #dump function_list as a key list sorted by value
        #and use it as a jscript array
        sorted_funcs_by_index = sorted(function_list.items(), key=lambda x: x[1])
        del function_list
        with open(output_name + ".js", "w") as f:
            temp_str = jscript_head() + "\n var func_list = ["
            for k, v in sorted_funcs_by_index:
                #json.dumps is used for escaping
                temp_str += json.dumps(get_cxx_filt_result(k)) + ","
                #reduce write calls
                if len(temp_str) > 8192 * 2:
                    f.write(temp_str)
                    temp_str = ''
            if len(temp_str) > 0:
                f.write(temp_str)
            f.write('"-"];' + jscipt_end())

    temp_str = ''
    with open(output_name, "w") as f:
        if only_body == False:
            f.write(header(fsave_format))
            f.write(additional_tags(fsave_format))
        nm = 0
        for k, v in sorted(info_.items()):  # sorted(info_.items(), key=lambda x: x[1].total_opted, reverse=True):
            temp_str += get_content(k, v, unique_id_prefix + str(nm), fsave_format)
            #reduce io write calls
            if len(temp_str) > 8192:
                f.write(temp_str)
                temp_str = ''
            nm += 1
        if len(temp_str) > 0:
            f.write(temp_str)
        if only_body == False:
            f.write(footer())


def fsave_report_launch(json_gz_list):

    cpus = cpu_count()
    if cpus > 32:
        cpus = 24

    c_count = 1  # 2 is sufficient  # if cpus<=1 else min(4,cpus)
    p_count = 3 if cpus <= 1 else max(8, cpus - c_count)

    m = Manager()
    #consumer Queues
    list_Queue = [m.Queue() for index in range(0, c_count)]
    with Pool(processes=c_count) as consumers:
        #start consumers
        cs = consumers.map_async(consume_processed_mp, [(list_Queue, index,) for index in range(0, c_count)])
        with Pool(processes=p_count) as processors:
            processors.map(process_gzip_json_mp, [(fname, list_Queue,) for fname in json_gz_list])

        #send ends to inform our consumers
        for q in list_Queue:
            q.put(None)

        #wait for consumers
        cs.wait()


def main():
    if "--fsave" in sys.argv:
        json_gz_list = internal_glob(".", "*.json.gz")
        fsave_report_launch(json_gz_list)
        return

    file_info = obtain_info_from(sys.stdin)
    if len(file_info) > 0:
        print("---generating vectorization html report--")
        generate_report("vecmiss.html", file_info)
    else:
        #lets check if we got fsave files
        json_gz_list = internal_glob(".", "*.json.gz")
        fsave_report_launch(json_gz_list)


if __name__ == '__main__':
    main()
@@ -0,0 +1,354 @@
'''
@author : Abdelrauf rauf@konduit.ai
Simple object extractor for very big json files
'''

import sys
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free


cdef char JSON_1 = b':'
cdef char JSON_2 = b','
cdef char JSON_3 = b'{'
cdef char JSON_4 = b'}'
cdef char JSON_5 = b'['
cdef char JSON_6 = b']'
cdef char QUOTE = b'"'
cdef char ESCAPE = b"\\"
cdef char SPACE = b' '
cdef char TAB = b'\t'
cdef char CR = b'\r'
cdef char NL = b'\n'
cdef char B = b'\b'
cdef char EMPTY = b'\0'


cdef struct Span:
    int b
    int e


cdef inline Span read_unquoted(char *text, int start, int end):
    cdef Span sp
    cdef int j = start
    while j < end:
        #if text[j].isspace():
        if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B:
            j += 1
            continue
        if text[j] != QUOTE and text[j] != JSON_1 and text[j] != JSON_2 and text[j] != JSON_3 and text[j] != JSON_4 and text[j] != JSON_5 and text[j] != JSON_6:
            start = j
            j += 1
            while j < end:
                # read till JSON or white space
                if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B:
                    sp.b = start
                    sp.e = j
                    return sp
                elif text[j] == JSON_1 or text[j] == JSON_2 or text[j] == JSON_3 or text[j] == JSON_4 or text[j] == JSON_5 or text[j] == JSON_6:
                    sp.b = start
                    sp.e = j
                    return sp
                j += 1
            if j == end-1:
                sp.b = start
                sp.e = end
                return sp
        break
    sp.b = j
    sp.e = j
    return sp


cdef inline Span read_seq_token(char *text, int start, int end):
    #read quoted
    #skip white_space
    cdef Span sp
    cdef int j = start
    cdef char last_char
    cdef char char_x
    while j < end:
        if text[j] == SPACE or text[j] == NL or text[j] == TAB or text[j] == CR or text[j] == B:
            j += 1
            continue
        if text[j] == QUOTE:
            last_char = EMPTY
            #read till another quote
            start = j
            j += 1
            while j < end:
                char_x = text[j]
                if char_x == QUOTE and last_char != ESCAPE:
                    # finished reading
                    sp.b = start
                    sp.e = j+1
                    return sp
                last_char = char_x
                j += 1
            if j == end-1:
                sp.b = start
                sp.e = end
                return sp
        else:
            break
    return read_unquoted(text, j, end)


def tokenizer_spans(utext):
    '''
    we will just return tokenized spans
    '''
    token_spans = []
    last_char = b''
    end_i = len(utext)
    cdef char *text = utext
    i = 0
    cdef Span sp
    while i < end_i:
        sp = read_seq_token(text, i, end_i)
        i = sp.e
        if sp.e > sp.b:
            token_spans.append((sp.b, sp.e))
        if i < end_i:
            #if text[i] in JSON:
            if text[i] == JSON_3 or text[i] == JSON_4 or text[i] == JSON_5 or text[i] == JSON_6 or text[i] == JSON_1 or text[i] == JSON_2:
                token_spans.append((i, i+1))
            i += 1
    return token_spans


cdef class JsonObjXtractor:
    '''
    JsonObjXtractor that utilizes cython better
    '''

    cdef Span* token_spans
    cdef size_t size

    def __cinit__(self, size_t count=4096):
        self.token_spans = <Span*> PyMem_Malloc(count * sizeof(Span))
        self.size = count
        if not self.token_spans:
            raise MemoryError()

    def __tokenizer_spans(self, utext, length):
        '''
        we will just return the token spans length
        '''

        last_char = b''
        end_i = length
        cdef char *text = utext
        cdef int i = 0
        cdef size_t j = 0
        cdef Span sp
        while i < end_i:
            sp = read_seq_token(text, i, end_i)
            i = sp.e
            if sp.e > sp.b:
                self.token_spans[j] = sp
                j += 1
                if j > self.size:
                    #we need to reallocate
                    self.__resize(self.size + self.size//2)
            if i < end_i:
                #if text[i] in JSON:
                if text[i] == JSON_3 or text[i] == JSON_4 or text[i] == JSON_5 or text[i] == JSON_6 or text[i] == JSON_1 or text[i] == JSON_2:
                    sp.b = i
                    sp.e = i+1
                    self.token_spans[j] = sp
                    j += 1
                    if j > self.size:
                        #we need to reallocate
                        self.__resize(self.size + self.size//2)
                i += 1
        return j

    def try_extract_parent_obj(self, json_bytes, property_name, next_contains_value=b'', debug=False):
        '''
        try_extract_parent_obj(json_text, property_name, next_contains_value='', debug=False):
        make sure that the passed variables are encoded to bytes with encode('utf-8')
        next_contains_value is either direct content or followed by '['
        tries to extract the parent object of the given named object
        if the left brace of the parent object is outside of the current buffer
        it will be ignored
        if the right brace is outside of the buffer it will be left to be handled by the caller
        '''

        look_for_the_left = True
        parent_left = []
        parent_right = []
        parent_objects = []
        len_next = len(next_contains_value)
        cdef int ind = 0
        cdef int end
        cdef int last_start = 0
        property_name = b'"' + property_name + b'"'
        cdef int lenx = self.__tokenizer_spans(json_bytes, len(json_bytes))
        cdef char x
        cdef int i = -1
        cdef Span sp
        while i < lenx-1:
            i += 1
            ind = self.token_spans[i].b
            x = json_bytes[ind]
            if look_for_the_left == False:
                if x == JSON_3:
                    parent_right.append(ind)
                elif x == JSON_4:
                    if len(parent_right) == 0:
                        #we found the parent closing brace
                        look_for_the_left = True
                        parent_objects.append((parent_left[-1], ind+1))
                        last_start = ind+1
                        parent_left = []
                        parent_right = []
                    else:
                        parent_right.pop()
                continue
            #search obj
            if look_for_the_left:
                if x == JSON_3:
                    parent_left.append(ind)
                    last_start = ind
                elif x == JSON_4:
                    if len(parent_left) >= 1:
                        #ignore
                        parent_left.pop()

            if x == JSON_1:  # ':'
                #check to see if it is the property name
                old_property = EMPTY
                if i > 1:
                    sp = self.token_spans[i-1]
                    old_property = json_bytes[sp.b:sp.e]
                if old_property == property_name:
                    #we found it
                    if len(parent_left) < 1:
                        #the left brace is outside of the buffer
                        #we have to ignore it
                        #try to increase the buffer
                        if debug:
                            print('''left brace of the parent is outside of the buffer and the parent is big.
it will be ignored.
try to choose unambiguous property names if you are looking for small objects''', file=sys.stderr)
                        last_start = ind+1
                        parent_left = []
                        parent_right = []
                        continue
                    else:
                        #look for the right brace
                        if len_next > 0 and i+1 < lenx:
                            i += 1
                            ind = self.token_spans[i].b
                            end = self.token_spans[i].e
                            m = json_bytes[ind]

                            if m == JSON_5:
                                if i+1 < lenx:
                                    i += 1
                                    ind = self.token_spans[i].b
                                    end = self.token_spans[i].e
                                    if len_next <= end-ind and next_contains_value in json_bytes[ind:end]:
                                        look_for_the_left = False
                                        continue
                            elif len_next <= end-ind and next_contains_value in json_bytes[ind:end]:
                                look_for_the_left = False
                                continue

                            #ignore as it does not have that value
                            parent_left = []
                            parent_right = []
                            last_start = ind + 1
                        else:
                            look_for_the_left = False

        # lets return the last successfully opened brace as the last position,
        # or, for the left-brace failure case, the safe closed brace
        if len(parent_left) > 0:
            return (parent_objects, parent_left[-1])

        return (parent_objects, last_start)

    def __resize(self, size_t new_count):
        cdef Span* mem = <Span*> PyMem_Realloc(self.token_spans, new_count * sizeof(Span))
        if not mem:
            raise MemoryError()
        self.token_spans = mem
        self.size = new_count

    def __dealloc__(self):
        PyMem_Free(self.token_spans)


import json
import gzip
import sys
DEBUG_LOG = False


def json_gzip_extract_objects(filename, property_name, next_contains_value=''):
    strx = b''
    started = False
    b_next_contains_value = next_contains_value.encode('utf-8')
    b_property_name = property_name.encode('utf-8')
    objXt = JsonObjXtractor()
    with gzip.open(filename, 'rb') as f:
        if DEBUG_LOG:
            print("opened {0}".format(filename), file=sys.stderr)
        #instead of reading it line by line, read it as binary bytes
        is_End = False
        while is_End == False:
            buffer = f.read(8192*2)

            lenx = len(buffer)
            if lenx < 1:
                is_End = True
            else:
                strx = strx + buffer

            objects, last_index = objXt.try_extract_parent_obj(strx, b_property_name, b_next_contains_value)

            for start, end in objects:
                yield json.loads(strx[start:end])

            #remove processed
            if last_index < len(strx):
                strx = strx[last_index:]
            else:
                strx = b''

            if len(strx) > 16384*3:
                #buffer too big
                #try to avoid big parents
                if DEBUG_LOG:
                    print("parent object is too big. please, look for a better property name", file=sys.stderr)
                break
@@ -0,0 +1,3 @@
from distutils.core import setup
from Cython.Build import cythonize
setup(ext_modules=cythonize("bigGzipJson.pyx", language_level="3"))
@@ -282,6 +282,32 @@ elseif(CPU_BLAS)
        set_source_files_properties(../include/helpers/impl/OpTracker.cpp PROPERTIES COMPILE_FLAGS "-march=x86-64 -mtune=generic")
    endif()

    if(CHECK_VECTORIZATION)
        set(VECT_FILES cpu/NativeOps.cpp ${OPS_SOURCES} ${HELPERS_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} ${LOOPS_SOURCES})
        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")

            if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
                set(CHECK_VECT_FLAGS "-ftree-vectorize -fsave-optimization-record")
                #to process fsave-optimization-record we will need our cython version code
                message("Build Auto vectorization helpers")
                execute_process(COMMAND "python3" "${CMAKE_CURRENT_SOURCE_DIR}/../auto_vectorization/cython_setup.py" "build_ext" "--inplace" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/../auto_vectorization/" RESULT_VARIABLE ret)
                message("build='${ret}'")

                #remove failure cases that gcc sometimes fails to produce records for
                file(GLOB_RECURSE FAILURE_CASES false ../include/loops/cpu/compilation_units/reduce3*.cpp)
                #message("*****${FAILURE_CASES}")
                foreach(FL_ITEM ${FAILURE_CASES})
                    message("Removing failure case ${FL_ITEM}")
                    list(REMOVE_ITEM VECT_FILES ${FL_ITEM})
                endforeach()
            else()
                set(CHECK_VECT_FLAGS "-ftree-vectorize -fopt-info-vec-optimized-missed")
            endif()
            message("CHECK VECTORIZATION ${CHECK_VECT_FLAGS}")
            set_source_files_properties( ${VECT_FILES} PROPERTIES COMPILE_FLAGS "${CHECK_VECT_FLAGS}" )
        endif()
    endif()

    message("CPU BLAS")
    add_definitions(-D__CPUBLAS__=true)
    add_library(nd4jobj OBJECT cpu/NativeOps.cpp cpu/GraphExecutioner.cpp
@@ -55,6 +55,7 @@ TESTS="false"
VERBOSE="false"
VERBOSE_ARG="VERBOSE=1"
HELPER=
CHECK_VECTORIZATION="OFF"
NAME=
while [[ $# > 0 ]]
do

@@ -114,6 +115,9 @@ case $key in
    NAME="$value"
    shift # past argument
    ;;
    --check-vectorization)
    CHECK_VECTORIZATION="ON"
    ;;
    -j)
    MAKEJ="$value"
    shift # past argument

@@ -528,14 +532,27 @@ echo MINIFIER = "${MINIFIER_ARG}"
echo TESTS = "${TESTS_ARG}"
echo NAME = "${NAME_ARG}"
echo OPENBLAS_PATH = "$OPENBLAS_PATH"
echo CHECK_VECTORIZATION = "$CHECK_VECTORIZATION"
echo HELPERS = "$HELPERS"
mkbuilddir
pwd
eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" $HELPERS "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../..
eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" -DCHECK_VECTORIZATION="${CHECK_VECTORIZATION}" $HELPERS "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../..

if [ "$PARALLEL" == "true" ]; then
    MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ"
fi
if [ "$VERBOSE" == "true" ]; then
    MAKE_ARGUMENTS="$MAKE_ARGUMENTS $VERBOSE_ARG"
fi
eval $MAKE_COMMAND $MAKE_ARGUMENTS && cd ../../..

if [ "$CHECK_VECTORIZATION" == "ON" ]; then

    if [ "$MAKE_COMMAND" == "make" ]; then
        MAKE_ARGUMENTS="$MAKE_ARGUMENTS --output-sync=target"
    fi
    # duplicate stdout on fd 3, then pipe only the compiler diagnostics (stderr)
    # into the report generator while the regular build output stays on the console
    exec 3>&1
    eval $MAKE_COMMAND $MAKE_ARGUMENTS 2>&1 >&3 3>&- | python3 ../../auto_vectorization/auto_vect.py && cd ../../..
    exec 3>&-
else
    eval $MAKE_COMMAND $MAKE_ARGUMENTS && cd ../../..
fi