-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdecide_cutoff.py
executable file
·136 lines (112 loc) · 3.17 KB
/
decide_cutoff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# %%
import pandas as pd
import networkx as nx
from networkx.drawing.nx_pydot import write_dot
from semantic_group_discovery import find_clique_cover
from networkx.algorithms import bipartite
from scipy.stats import skew, kurtosis
from tabulate import tabulate
def _edge_key(row):
    """Return an order-independent key for the attribute pair held in *row*."""
    return tuple(sorted([row["a1"], row["a2"]]))


# Load the pairwise association table, skipping every row in which either
# endpoint is the literal attribute name 'index'.
dfch = pd.read_csv("./assoc_full.csv").query("a1 != 'index' and a2 != 'index'")
# (a1, a2) and (a2, a1) share the same key, so dropping duplicate keys keeps
# one row per unordered pair.
dfch["key"] = dfch.apply(_edge_key, axis=1)
dfdedup = dfch.drop_duplicates(subset=["key"])
print(f"Total de aristas: {len(dfdedup)}")
# Rows flagged as significant, and the ascending distinct `cramer` values
# found among them.
dfch2 = dfch.query("significant == True")
distinct_cramer = sorted(dfch2["cramer"].unique())
# Number of distinct attributes seen on either side of a pair (shown as the
# cell output when run interactively).
len(set(dfch["a1"].unique()) | set(dfch["a2"].unique()))
# %%
# a1 x a2 matrix of `cramer` values for the significant rows; pairs that do
# not occur in the table are filled with 0.
adj = dfch2.pivot_table(index="a1", columns="a2", values="cramer").fillna(0)
gr = nx.from_pandas_adjacency(adj)
# Same construction over every row of the table, significant or not.
adj_full = dfch.pivot_table(index="a1", columns="a2", values="cramer").fillna(0)
gr_full = nx.from_pandas_adjacency(adj_full)
# %%
# Edge counts of the full graph and of the significant-only graph.
gr_full.number_of_edges(), gr.number_of_edges()
# %%
# Search for the cut-off: walk the distinct `cramer` values in ascending
# order, keep only the significant pairs at or above the current value, and
# stop at the first value whose graph contains at least one bridge.
dsets = []  # NOTE(review): never read or filled anywhere in the visible script
cut = None
cut_point = None
for threshold in distinct_cramer:
    # Boolean mask instead of a string-interpolated query: the comparison uses
    # the float itself rather than its textual rendering.
    kept = dfch2[dfch2["cramer"] >= threshold]
    candidate = nx.Graph()
    # Edges are added without attributes, in row order.
    candidate.add_edges_from(zip(kept["a1"], kept["a2"]))
    # Computed once and reused for both the test and the report.
    bridges = list(nx.bridges(candidate))
    if bridges:
        print(nx.number_connected_components(candidate), bridges, threshold)
        cut = candidate
        cut_point = threshold
        break
else:
    # Reached only when the loop never hit `break` (this includes an empty
    # list of thresholds). Fail here with a clear message instead of letting a
    # later cell trip over `cut is None`.
    raise ValueError(
        "no cramer threshold yields a graph with a bridge; "
        "cut and cut_point were left as None"
    )
# %%
# Threshold selected by the bridge search (displayed when run as a cell).
cut_point
# %%
# Edge count of the significant-association graph.
gr.number_of_edges()
# %%
# Edge count of the graph retained at the cut-off.
cut.number_of_edges()
# %%
nx.max_weight_matching(gr)
# %%
# NOTE(review): the edges of `cut` are added without a weight attribute, so
# this is presumably an arbitrary spanning tree rather than a weighted
# maximum one - confirm that is intended.
cutoff_tree = nx.maximum_spanning_tree(cut)
write_dot(cutoff_tree, "./cutoff_max_st.dot")
# %%
import itertools

# Cross-group connectivity of the cut-off graph.
# For every unordered pair of groups this cell records:
#   between_pairs[pair] - number of `cut` edges running between the two groups
#   sources[pair]       - nodes of the first group that touch such an edge
#   targets[pair]       - nodes of the second group that touch such an edge
between_pairs = {}
total_attributes = {}  # NOTE(review): never read or filled in the visible script
sources = {}
targets = {}
# First element of find_clique_cover's result; used below as a mapping from a
# group identifier to the collection of nodes in that group.
cc = find_clique_cover(cut)[0]
ccvs = cc.keys()
for group_id, members in cc.items():
    subg = nx.maximum_spanning_tree(cut.subgraph(members))
    print(group_id, subg)
for pair in itertools.combinations(ccvs, 2):
    left_id, right_id = pair
    for u, v in itertools.product(cc[left_id], cc[right_id]):
        if cut.has_edge(u, v):
            # setdefault + add mutates one set per pair instead of building a
            # fresh set for every edge found.
            sources.setdefault(pair, set()).add(u)
            targets.setdefault(pair, set()).add(v)
            between_pairs[pair] = between_pairs.get(pair, 0) + 1
# `sources` and `targets` always receive the same keys, so one pass over
# `sources` covers every recorded pair exactly once. The stored value is the
# smaller of the two distinct-endpoint counts.
sts = {pair: min(len(sources[pair]), len(targets[pair])) for pair in sources}
# Emit one Markdown table row per connected pair of groups, numbered from 1.
for row, ((left_id, right_id), bound) in enumerate(sts.items(), start=1):
    print(f"| {row} | {left_id} | {right_id} | {bound} |")
# %%
# Group-level graph: one edge per pair of groups recorded in `between_pairs`;
# both the weight and the label of an edge hold that pair's count.
g_spanning = nx.Graph()
g_spanning.add_edges_from(
    (left, right, {"weight": count, "label": count})
    for (left, right), count in between_pairs.items()
)
write_dot(g_spanning, "./spanning.dot")
# %%
# Maximum spanning tree of the group-level graph, dumped and listed.
g_max_spanning = nx.maximum_spanning_tree(g_spanning)
write_dot(g_max_spanning, "./spanning_max.dot")
print(g_max_spanning.edges)
# %%
g_max_spanning.number_of_edges()
# %%
g_spanning.number_of_edges()
# %%
nx.max_weight_matching(g_spanning)
# %%
# For every group with more than one node, print a minimum edge cover of the
# subgraph that `cut` induces on that group.
for group_id, members in cc.items():
    induced = cut.subgraph(members)
    if len(induced.nodes) <= 1:
        continue
    print(group_id, nx.min_edge_cover(induced))
# %%
# Deduplicated association table (displayed when run as a cell).
dfdedup
# %%
# Skewness and kurtosis of the `cramer` column over the deduplicated pairs.
print(f"Asimetría: {skew(dfdedup['cramer'])}\n Curtosis: {kurtosis(dfdedup['cramer'])}")
# %%
# Number of deduplicated pairs whose `cramer` value is below 0.08.
# NOTE(review): 0.08 is hard-coded; presumably it mirrors `cut_point` - confirm.
# A stray literal `6` that preceded the next cell marker was removed: it was a
# no-op statement that also hid the marker from cell-aware tools.
print(f"Total de aristas eliminadas = {len(dfdedup.query('cramer < 0.08'))}")
# %%
# Print the contents of ./sg_groups.csv as a table in tabulate's "github"
# format, using the column names as headers and omitting the row index.
sg_groups = pd.read_csv("./sg_groups.csv")
sg_table = tabulate(
    sg_groups,
    headers="keys",
    showindex="never",
    tablefmt="github",
)
print(sg_table)
# %%