#!/usr/bin/env python
import os
import sys
import os.path


# Keyword (with trailing space) that introduces an import statement in the
# scanned .java / .scala sources.
_IMPORT_CMD = "import "


def find_import(line):
    """Extract the imported names from one source line.

    Handles plain imports (``import a.b.C;``) and brace-grouped imports
    (``import a.b.{C, D}``), which are expanded to one name per member.

    Args:
        line: a single line of source text.

    Returns:
        A list of dotted import names, or None when the line is not an
        import statement.
    """
    line = line.strip()
    if not line.startswith(_IMPORT_CMD):
        return None
    # Drop the keyword and the optional ';' terminator. The final strip()
    # removes whitespace that was sitting between the name and the ';'
    # (e.g. "import a.{B, C} ;"), which would otherwise keep the closing
    # brace from being stripped below.
    target = line[len(_IMPORT_CMD):].strip().strip(';').strip()
    parts = target.split("{")
    if len(parts) == 1:
        return parts
    head = parts[0]
    members = parts[1].strip("}").split(",")
    return ["%s%s" % (head, member.strip()) for member in members]


def import2path(roots, import_name):
    """Map a dotted import name to an existing source file.

    For each root, in order, ``<root>/<a/b/C>.java`` is tried first and then
    ``<root>/<a/b/C>.scala``.

    Args:
        roots: iterable of source root directories.
        import_name: dotted name such as ``a.b.C``.

    Returns:
        The path of the first existing file, or None if there is none.
    """
    spath = import_name.replace('.', '/')
    for root in roots:
        for extension in ("java", "scala"):
            fpath = os.path.join(root, "%s.%s" % (spath, extension))
            if os.path.isfile(fpath):
                return fpath
    return None


def file_info(fpath):
    """Read a source file and summarize its size and imports.

    Args:
        fpath: path of the file to read.

    Returns:
        A tuple ``(line_count, imports)`` where ``line_count`` is the number
        of non-blank lines that are not import statements and ``imports`` is
        the list of imported names found by ``find_import``.
    """
    with open(fpath, "r") as f:
        stripped = [line.strip() for line in f]
    lines = [line for line in stripped if line != ""]

    imports = []
    # Counted separately from len(imports): one brace-grouped import line
    # expands to several names but is still only one line of source.
    import_line_count = 0
    for line in lines:
        import_array = find_import(line)
        if import_array is not None:
            import_line_count += 1
            imports.extend(import_array)
    return (len(lines) - import_line_count, imports)


def collect_file_info(collected, roots, entry_name):
    """Recursively record ``entry_name`` and everything it imports.

    ``collected`` is updated in place. A name that resolves to a file is
    stored as ``collected[file_path] = line_count``; a name that does not
    resolve is stored as ``collected[entry_name] = None``.

    Args:
        collected: dict accumulating the results (mutated).
        roots: iterable of source root directories.
        entry_name: dotted import name to start from.
    """
    if entry_name in collected:
        return
    fpath = import2path(roots, entry_name)
    if fpath is None:
        collected[entry_name] = None
        return
    if fpath in collected:
        return
    line_count, imports = file_info(fpath)
    # Record the file before descending so import cycles terminate.
    collected[fpath] = line_count
    for import_name in imports:
        collect_file_info(collected, roots, import_name)


def collect_ref_info(roots, entry_names):
    """Collect reference info for every entry name.

    Args:
        roots: iterable of source root directories.
        entry_names: iterable of dotted import names to start from.

    Returns:
        The dict built by ``collect_file_info`` over all entry names.
    """
    collect_info = {}
    for entry_name in entry_names:
        collect_file_info(collect_info, roots, entry_name)
    return collect_info


def show_files_with_lines(files, title):
    """Print ``name:line_count`` for each item, framed by header and totals.

    Args:
        files: iterable of sequences whose first element is a name and whose
            second element is a numeric line count.
        title: text shown in the header line.
    """
    print("=============== %s =================" % title)
    lines_total = 0
    files_total = 0
    for f in files:
        lines_total = f[1] + lines_total
        files_total = files_total + 1
        print("%s:%s" % (f[0], f[1]))
    print("=============== total lines = %d,total files = %d =================" % (lines_total, files_total))


def show_files(files, title):
    """Print a header line followed by each item of ``files``.

    Args:
        files: iterable of printable items.
        title: text shown in the header line.
    """
    print("=============== %s =================" % title)
    for f in files:
        print(f)


def _read_nonblank_lines(path):
    """Return the stripped, non-blank lines of the text file at ``path``."""
    with open(path) as f:
        stripped = [line.strip() for line in f]
    return [line for line in stripped if line != ""]


def _main(argv):
    """Run the report: argv[1] lists source roots, argv[2] lists entry names.

    Returns the process exit status.
    """
    if len(argv) < 3:
        program = argv[0] if argv else "script"
        sys.stderr.write("usage: %s ROOTS_FILE ENTRY_NAMES_FILE\n" % program)
        return 2

    roots = _read_nonblank_lines(argv[1])
    entry_names = _read_nonblank_lines(argv[2])

    ref_info = collect_ref_info(roots, entry_names)

    # Resolved files carry a line count; unresolved import names carry None.
    in_files = [item for item in ref_info.items() if item[1] is not None]
    out_files = [item[0] for item in ref_info.items() if item[1] is None]

    spark_not_found = [f for f in out_files if f.startswith("org.apache.spark.")]
    spark_not_found.sort()

    hadoop_files = [f for f in out_files if f.startswith("org.apache.hadoop.")]
    hadoop_files.sort()

    other_files = list(set(out_files) - set(spark_not_found) - set(hadoop_files))
    other_files.sort()

    show_files_with_lines(in_files, "spark source")
    show_files(spark_not_found, "spark import name not file name")
    show_files(hadoop_files, "hadoop ref")
    show_files(other_files, "others ref")
    return 0


if __name__ == "__main__":
    sys.exit(_main(sys.argv))