asammoud
Re-add large CSVs using Git LFS
b265364
# preprocess data
import numpy as np
import re
def get_most_common_features(target, all_features, max=3, min=3):
    """Return the features that share a bounded number of name tokens with ``target``.

    Feature names are split on ``'_'`` and compared as sets of tokens, so
    token order and repeated tokens do not matter.

    Args:
        target: Feature name to compare against.
        all_features: Iterable of candidate feature names.
        max: Largest number of shared tokens that still counts as a match.
        min: Smallest number of shared tokens that counts as a match.

    Returns:
        A list of the matching features, in the order they appear in
        ``all_features``. ``target`` itself is never included.
    """
    # NOTE: ``max`` and ``min`` shadow the builtins inside this function; the
    # names are kept because callers may pass them as keyword arguments.
    res = []
    # Loop-invariant: tokenise the target once instead of once per candidate.
    main_keys = set(target.split('_'))

    for feature in all_features:
        if target == feature:
            continue

        common_key_num = len(main_keys.intersection(feature.split('_')))
        if min <= common_key_num <= max:
            res.append(feature)

    return res
def build_net(target, all_features):
    """Build a small graph of name-related features around ``target``.

    Starting from ``target``, related features are looked up with
    ``get_most_common_features`` for two levels. Every node gets an integer
    index (its position in the returned feature list) and every
    parent/child relation becomes one edge.

    Args:
        target: Name of the root feature.
        all_features: Iterable of all candidate feature names.

    Returns:
        A tuple ``(edge_indexes, index_feature_map)`` where ``edge_indexes``
        is ``[child_indices, parent_indices]`` (two parallel lists, one entry
        per edge) and ``index_feature_map`` lists the feature names so that
        ``index_feature_map[i]`` is the node with index ``i``. ``target`` is
        always index 0.
    """
    edge_indexes = [
        [],  # child node index of each edge
        [],  # parent node index of each edge
    ]
    index_feature_map = [target]
    # Mirror of index_feature_map (name -> position) so lookups do not have
    # to scan the list; entries are unique because names are only added once.
    node_index = {target: 0}

    def index_of(name):
        # Register ``name`` on first sight and return its stable node index.
        if name not in node_index:
            node_index[name] = len(index_feature_map)
            index_feature_map.append(name)
        return node_index[name]

    # find closest features (nodes):
    parent_list = [target]
    graph_map = {}
    depth = 2

    for _ in range(depth):
        for feature in parent_list:
            children = get_most_common_features(feature, all_features)

            # Exclude features that were already expanded as a parent.
            pure_children = [child for child in children if child not in graph_map]
            graph_map[feature] = pure_children

            p_index = index_of(feature)
            for child in pure_children:
                c_index = index_of(child)
                edge_indexes[1].append(p_index)
                edge_indexes[0].append(c_index)

        # NOTE(review): only the children of the last parent processed are
        # carried to the next level. With depth == 2 the value assigned after
        # the final level is never read, so this is unobservable today;
        # revisit before raising ``depth``.
        parent_list = pure_children

    return edge_indexes, index_feature_map
def construct_data(data, feature_map, labels=0):
    """Collect the requested columns of ``data`` as lists, followed by a label row.

    Args:
        data: Table-like object exposing ``columns`` and ``loc`` (used here
            like a pandas DataFrame).
        feature_map: Iterable of column names to extract, in output order.
            Names missing from ``data.columns`` are reported on stdout and
            skipped.
        labels: Either a single int, which is repeated once per sample, or a
            sequence with exactly one entry per sample.

    Returns:
        A list with one list of values per found feature. The label row is
        appended last; a ``labels`` sequence whose length does not match the
        number of samples is not appended at all.
    """
    res = []

    for feature in feature_map:
        if feature in data.columns:
            res.append(data.loc[:, feature].values.tolist())
        else:
            print(feature, 'not exist in data')

    # append labels as last
    # When none of the requested features exist there is no column to measure,
    # so fall back to the number of rows in ``data`` instead of failing on
    # ``res[0]``.
    sample_n = len(res[0]) if res else len(data)

    if isinstance(labels, int):
        res.append([labels] * sample_n)
    elif len(labels) == sample_n:
        res.append(labels)

    return res
def build_loc_net(struc, all_features, feature_map=None):
    """Convert a parent -> children structure into index-based edge lists.

    Args:
        struc: Mapping from a node name to an iterable of its child names.
        all_features: Collection of valid feature names; nodes and children
            not contained in it are ignored.
        feature_map: Optional list that assigns indices to names by position.
            When given, it is used as-is and parent names missing from it are
            appended to it in place. When omitted, a fresh empty list is used
            for this call.

    Returns:
        ``[child_indices, parent_indices]``: two parallel lists with one
        entry per edge.

    Raises:
        ValueError: If a child is a valid feature but has no index in the
            feature map (children are never added automatically).
    """
    # A new list per call: a shared ``[]`` default would keep the nodes of
    # earlier calls and shift the indices of later ones.
    index_feature_map = [] if feature_map is None else feature_map
    edge_indexes = [
        [],  # child node index of each edge
        [],  # parent node index of each edge
    ]

    for node_name, node_list in struc.items():
        if node_name not in all_features:
            continue

        if node_name not in index_feature_map:
            index_feature_map.append(node_name)
        p_index = index_feature_map.index(node_name)

        for child in node_list:
            if child not in all_features:
                continue

            if child not in index_feature_map:
                print(f'error: {child} not in index_feature_map')
                # Fail with an explicit message rather than the bare
                # "x is not in list" raised by list.index().
                raise ValueError(f'{child!r} is not in index_feature_map')

            c_index = index_feature_map.index(child)
            edge_indexes[0].append(c_index)
            edge_indexes[1].append(p_index)

    return edge_indexes