import json
import os
import subprocess
import sys

import pandas as pd
import pydriller
from dotenv import dotenv_values

from Database import Database
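
# Pipeline overview (summary of the code below): RefactoringMiner writes a JSON
# report per repository, parse_json_output() keeps the method-level refactorings
# from that report, and create_project_dataframe() uses PyDriller to pair each
# refactored method's before-source with its after-source for insertion into the DB.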
class RefactorAnalysis:
    def __init__(self, input_path="", output_path=""):
        base_dir = os.path.dirname(os.path.abspath(__file__))
        if input_path == "":
            self.input_path = os.path.join(base_dir, "data", "refactoring-toy-example")
        else:
            self.input_path = input_path
        if output_path == "":
            self.output_path = os.path.join(base_dir, "output_ref", "output.json")
        else:
            self.output_path = output_path
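
    # RefactoringMiner ships a launcher script under executable/.../bin; the
    # "-a <repo> -json <file>" invocation analyzes all commits and writes a JSON
    # report (flags as in the RefactoringMiner CLI bundled with this project).
    # Note that os.chdir() below changes the process-wide working directory;
    # main() resets it before each repository.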
    def generate_refactor_details(self):
        ref_miner_bin = os.path.abspath("executable/RefactoringMiner/bin")
        command = ["sh", "RefactoringMiner", "-a", self.input_path, "-json", self.output_path]
        try:
            os.chdir(ref_miner_bin)
            shell_result = subprocess.run(command, capture_output=True, text=True)
            shell_result.check_returncode()
        except subprocess.CalledProcessError as error:
            print(error)
            sys.exit(1)
        except Exception as e:
            print(e)
            return 1
        return 0
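
    # The report read below follows RefactoringMiner's JSON layout; only the
    # keys this parser actually touches are sketched here:
    #   {"commits": [{"sha1": "...",
    #                 "refactorings": [{"type": "Extract Method",
    #                                   "leftSideLocations": [{"filePath": "...",
    #                                                          "startLine": 1,
    #                                                          "endLine": 10}]}]}]}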
    def parse_json_output(self):
        # Keep only method-level refactorings (e.g. Extract Method, Rename Method).
        with open(self.output_path) as f:
            json_output = json.load(f)
        dict_output = {}
        for obj in json_output["commits"]:
            if len(obj["refactorings"]) == 0:
                continue
            changes = []
            se_lines = []
            ref_type = None
            for ref in obj["refactorings"]:
                if "Method" not in ref["type"]:
                    continue
                ref_type = ref["type"]
                for location in ref["leftSideLocations"]:
                    changes.append(location["filePath"])
                    se_lines.append((location["startLine"], location["endLine"]))
            if not changes:
                continue
            dict_output[obj["sha1"]] = {
                "paths": changes,
                "ref_start_end": se_lines,
                "ref_type": ref_type,
            }
        return dict_output
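
    # Matching strategy: a method counts as refactored when its PyDriller
    # [start_line, end_line] span encloses the left-side span reported by
    # RefactoringMiner. Each row pairs the method body before the commit
    # (method_refactored) with the body at the same lines after it (meth_rf_neg).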
    def create_project_dataframe(self):
        df = pd.DataFrame(columns=["commit", "refactoring_type", "filename", "meth_rf_neg", "method_refactored"])
        parse_output_dict = self.parse_json_output()
        commits_to_analyze = list(parse_output_dict.keys())
        for commit in pydriller.Repository(self.input_path, only_commits=commits_to_analyze).traverse_commits():
            ref_list = parse_output_dict.get(commit.hash)
            ref_path_name = [str(p).split("/")[-1] for p in ref_list["paths"]]
            for cf in commit.modified_files:
                try:
                    index_ref = ref_path_name.index(cf.filename)
                except ValueError:
                    continue
                if len(cf.changed_methods) == 0:
                    continue
                # changed_methods narrows the candidate set; without it we would
                # have to scan every method in the file.
                for cm in cf.changed_methods:
                    if cm.start_line <= ref_list["ref_start_end"][index_ref][0] and cm.end_line >= ref_list["ref_start_end"][index_ref][1]:
                        method_source_code = self.__split_and_extract_methods(cf.source_code_before, cm.start_line, cm.end_line)
                        method_source_code_neg = self.__split_and_extract_methods(cf.source_code, cm.start_line, cm.end_line)
                        df_row = {
                            "commit": commit.hash,
                            "refactoring_type": ref_list["ref_type"],
                            "filename": cf.filename,
                            "meth_rf_neg": method_source_code_neg,
                            "method_refactored": method_source_code,
                        }
                        df.loc[len(df)] = df_row
        return df

    def __split_and_extract_methods(self, source_code, start_line, end_line):
        # Line numbers are 1-based, so shift the slice start down by one.
        source_code_lines = str(source_code).splitlines()
        return "\n".join(source_code_lines[start_line - 1:end_line])
def main():
    if not os.path.exists("data/repos/"):
        try:
            print("Starting repo download")
            repo_script = subprocess.run(["python", "repo_download.py"], capture_output=True, text=True)
            repo_script.check_returncode()
        except subprocess.CalledProcessError as err:
            print(err)
            sys.exit(1)
        print("Repo Download Completed")
    lst_repos = next(os.walk("data/repos/"))[1]
    print(len(lst_repos))
    cwd = os.path.dirname(os.path.abspath(__file__))
    columns = ["commit", "refactoring_type", "filename", "meth_rf_neg", "method_refactored"]
    final_df = pd.DataFrame(columns=columns)
    database = Database(dotenv_values(".env")["COLLECTION_NAME"])
    count = 1
    batch_size = 5
    for idx, repo in enumerate(lst_repos):
        os.chdir(cwd)
        try:
            ref_obj = RefactorAnalysis(
                os.path.abspath(os.path.join("data/repos", repo)),
                os.path.abspath(os.path.join("output_ref", repo + ".json")),
            )
            # ref_obj.generate_refactor_details()  # left disabled, as in the
            # original; the JSON reports are assumed to exist under output_ref/.
            df = ref_obj.create_project_dataframe()
        except Exception as e:
            print(e)
            continue
        final_df = pd.concat([final_df, df], ignore_index=True)
        if count == batch_size or idx == len(lst_repos) - 1:
            print("Inserting into DB", idx)
            database.insert_docs(final_df.to_dict(orient="records"))
            final_df = pd.DataFrame(columns=columns)
            count = 1
        else:
            count += 1


if __name__ == "__main__":
    main()
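
# Example usage on a single repository (paths are illustrative, not from the
# original script):
#   ra = RefactorAnalysis(os.path.abspath("data/repos/my-repo"),
#                         os.path.abspath("output_ref/my-repo.json"))
#   ra.generate_refactor_details()
#   df = ra.create_project_dataframe()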