Skip to content

Commit e8f0856

Browse files
authored
Merge pull request #12 from saibalmars/speed_test
Provide choice of shortest path computation method
2 parents 86f33a5 + 4887215 commit e8f0856

File tree

1 file changed

+140
-57
lines changed

1 file changed

+140
-57
lines changed

GraphRicciCurvature/OllivierRicci.py

Lines changed: 140 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -44,23 +44,81 @@
4444
_exp_power = 2
4545
_proc = cpu_count()
4646
_cache_maxsize = 1000000
47-
_nbr_topk = 50
47+
_shortest_path = "all_pairs"
48+
_nbr_topk = 1000
49+
_apsp = {}
4850

4951
# -------------------------------------------------------
5052

53+
@lru_cache(_cache_maxsize)
54+
def _get_single_node_neighbors_distributions(node, direction="successors"):
55+
"""Get the neighbor density distribution of given node `node`.
56+
57+
Parameters
58+
----------
59+
node : int
60+
Node index in Networkit graph `_Gk`.
61+
direction : {"predecessors", "successors"}
62+
Direction of neighbors in directed graph. (Default value: "successors")
63+
64+
Returns
65+
-------
66+
distributions : lists of float
67+
Density distributions of neighbors up to top `_nbr_topk` nodes.
68+
nbrs : lists of int
69+
Neighbor index up to top `_nbr_topk` nodes.
70+
71+
"""
72+
if _Gk.isDirected():
73+
if direction == "predecessors":
74+
neighbors = _Gk.inNeighbors(node)
75+
else: # successors
76+
neighbors = _Gk.neighbors(node)
77+
else:
78+
neighbors = _Gk.neighbors(node)
79+
80+
# Get sum of distributions from x's all neighbors
81+
heap_weight_node_pair = []
82+
if direction == "predecessors":
83+
for nbr in neighbors:
84+
if len(heap_weight_node_pair) < _nbr_topk:
85+
heapq.heappush(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(nbr, node) ** _exp_power), nbr))
86+
else:
87+
heapq.heappushpop(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(nbr, node) ** _exp_power), nbr))
88+
else: # successors
89+
for nbr in neighbors:
90+
if len(heap_weight_node_pair) < _nbr_topk:
91+
heapq.heappush(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(node, nbr) ** _exp_power), nbr))
92+
else:
93+
heapq.heappushpop(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(node, nbr) ** _exp_power), nbr))
94+
95+
nbr_edge_weight_sum = sum([x[0] for x in heap_weight_node_pair])
96+
97+
if nbr_edge_weight_sum > EPSILON:
98+
# Sum need to be not too small to prevent divided by zero
99+
distributions = [(1.0 - _alpha) * w / nbr_edge_weight_sum for w, _ in heap_weight_node_pair]
100+
nbr = [x[1] for x in heap_weight_node_pair]
101+
return distributions + [_alpha], nbr + [node]
102+
elif len(neighbors) == 0:
103+
# No neighbor, all mass stay at node
104+
return [1], [node]
105+
else:
106+
logger.warning("Neighbor weight sum too small, list:", heap_weight_node_pair)
107+
distributions = [(1.0 - _alpha) / len(heap_weight_node_pair)] * len(heap_weight_node_pair)
108+
nbr = [x[1] for x in heap_weight_node_pair]
109+
return distributions + [_alpha], nbr + [node]
51110

52-
def _distribute_densities(source, target, nbr_topk=_nbr_topk):
111+
112+
def _distribute_densities(source, target):
53113
"""Get the density distributions of source and target node, and the cost (all pair shortest paths) between
54-
all source's and target's neighbors.
114+
all source's and target's neighbors. Notice that only neighbors with top `_nbr_topk` edge weights.
55115
56116
Parameters
57117
----------
58118
source : int
59119
Source node index in Networkit graph `_Gk`.
60120
target : int
61121
Target node index in Networkit graph `_Gk`.
62-
nbr_topk : int
63-
Only take the neighbors with top k edge weights.
64122
Returns
65123
-------
66124
x : (m,) numpy.ndarray
@@ -72,50 +130,14 @@ def _distribute_densities(source, target, nbr_topk=_nbr_topk):
72130
73131
"""
74132

75-
# Append source and target node into weight distribution matrix x,y
133+
# Distribute densities for source and source's neighbors as x
76134
if _Gk.isDirected():
77-
source_nbr = _Gk.inNeighbors(source)
135+
x, source_topknbr = _get_single_node_neighbors_distributions(source, "predecessors")
78136
else:
79-
source_nbr = _Gk.neighbors(source)
80-
target_nbr = _Gk.neighbors(target)
81-
82-
def _get_single_node_neighbors_distributions(node, neighbors, direction="successors"):
83-
# Get sum of distributions from x's all neighbors
84-
heap_weight_node_pair = []
85-
if direction == "predecessors":
86-
for nbr in neighbors:
87-
if len(heap_weight_node_pair) < nbr_topk:
88-
heapq.heappush(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(nbr, node) ** _exp_power), nbr))
89-
else:
90-
heapq.heappushpop(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(nbr, node) ** _exp_power), nbr))
91-
else: # successors
92-
for nbr in neighbors:
93-
if len(heap_weight_node_pair) < nbr_topk:
94-
heapq.heappush(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(node, nbr) ** _exp_power), nbr))
95-
else:
96-
heapq.heappushpop(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(node, nbr) ** _exp_power), nbr))
97-
98-
nbr_edge_weight_sum = sum([x[0] for x in heap_weight_node_pair])
99-
100-
if nbr_edge_weight_sum > EPSILON:
101-
# Sum need to be not too small to prevent divided by zero
102-
distributions = [(1.0 - _alpha) * w / nbr_edge_weight_sum for w, _ in heap_weight_node_pair]
103-
nbr = [x[1] for x in heap_weight_node_pair]
104-
return distributions + [_alpha], nbr + [node]
105-
elif len(neighbors) == 0:
106-
# No neighbor, all mass stay at node
107-
return [1], [node]
108-
else:
109-
logger.warning("Neighbor weight sum too small, list:", heap_weight_node_pair)
110-
distributions = [(1.0 - _alpha) / len(heap_weight_node_pair)] * len(heap_weight_node_pair)
111-
nbr = [x[1] for x in heap_weight_node_pair]
112-
return distributions + [_alpha], nbr + [node]
113-
114-
# Distribute densities for source and source's neighbors as x
115-
x, source_topknbr = _get_single_node_neighbors_distributions(source, source_nbr, "predecessors")
137+
x, source_topknbr = _get_single_node_neighbors_distributions(source, "successors")
116138

117139
# Distribute densities for target and target's neighbors as y
118-
y, target_topknbr = _get_single_node_neighbors_distributions(target, target_nbr, "successors")
140+
y, target_topknbr = _get_single_node_neighbors_distributions(target, "successors")
119141

120142
# construct the cost dictionary from x to y
121143
d = np.zeros((len(x), len(y)))
@@ -131,7 +153,7 @@ def _get_single_node_neighbors_distributions(node, neighbors, direction="success
131153

132154

133155
@lru_cache(_cache_maxsize)
134-
def _get_pairwise_sp(source, target):
156+
def _source_target_shortest_path(source, target):
135157
"""Compute pairwise shortest path from `source` to `target` by BidirectionalDijkstra via Networkit.
136158
137159
Parameters
@@ -153,6 +175,42 @@ def _get_pairwise_sp(source, target):
153175
return length
154176

155177

178+
def _get_pairwise_sp(source, target):
179+
"""Compute pairwise shortest path from `source` to `target`.
180+
181+
Parameters
182+
----------
183+
source : int
184+
Source node index in Networkit graph `_Gk`.
185+
target : int
186+
Target node index in Networkit graph `_Gk`.
187+
188+
Returns
189+
-------
190+
length : float
191+
Pairwise shortest path length.
192+
193+
"""
194+
195+
if _shortest_path == "pairwise":
196+
return _source_target_shortest_path(source, target)
197+
198+
return _apsp[source][target]
199+
200+
201+
def _get_all_pairs_shortest_path():
202+
"""Pre-compute all pairs shortest paths of the assigned graph `_Gk`."""
203+
logger.info("Start to compute all pair shortest path.")
204+
205+
global _Gk
206+
207+
t0 = time.time()
208+
apsp = nk.distance.APSP(_Gk).run().getDistances()
209+
logger.info("%8f secs for all pair by NetworKit." % (time.time() - t0))
210+
211+
return apsp
212+
213+
156214
def _optimal_transportation_distance(x, y, d):
157215
"""Compute the optimal transportation distance (OTD) of the given density distributions by CVXPY.
158216
@@ -287,12 +345,12 @@ def _compute_ricci_curvature_single_edge(source, target):
287345
assert _method in ["OTD", "ATD", "Sinkhorn"], \
288346
'Method %s not found, support method:["OTD", "ATD", "Sinkhorn"]' % _method
289347
if _method == "OTD":
290-
x, y, d = _distribute_densities(source, target, _nbr_topk)
348+
x, y, d = _distribute_densities(source, target)
291349
m = _optimal_transportation_distance(x, y, d)
292350
elif _method == "ATD":
293351
m = _average_transportation_distance(source, target)
294352
elif _method == "Sinkhorn":
295-
x, y, d = _distribute_densities(source, target, _nbr_topk)
353+
x, y, d = _distribute_densities(source, target)
296354
m = _sinkhorn_distance(x, y, d)
297355

298356
# compute Ricci curvature: k=1-(m_{x,y})/d(x,y)
@@ -310,7 +368,7 @@ def _wrap_compute_single_edge(stuff):
310368
def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
311369
alpha=0.5, method="OTD",
312370
base=math.e, exp_power=2, proc=cpu_count(), chunksize=None, cache_maxsize=1000000,
313-
nbr_topk=1000):
371+
shortest_path="all_pairs", nbr_topk=1000):
314372
"""Compute Ricci curvature for edges in given edge lists.
315373
316374
Parameters
@@ -344,6 +402,8 @@ def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
344402
cache_maxsize : int
345403
Max size for LRU cache for pairwise shortest path computation.
346404
Set this to `None` for unlimited cache. (Default value = 1000000)
405+
shortest_path : {"all_pairs","pairwise"}
406+
Method to compute shortest path. (Default value = `all_pairs`)
347407
nbr_topk : int
348408
Only take the top k edge weight neighbors for density distribution.
349409
Smaller k run faster but the result is less accurate. (Default value = 1000)
@@ -355,6 +415,9 @@ def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
355415
356416
"""
357417

418+
logger.info("Number of nodes: %d" % G.number_of_nodes())
419+
logger.info("Number of edges: %d" % G.number_of_edges())
420+
358421
if not nx.get_edge_attributes(G, weight):
359422
print('Edge weight not detected in graph, use "weight" as default edge weight.')
360423
for (v1, v2) in G.edges():
@@ -369,7 +432,9 @@ def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
369432
global _exp_power
370433
global _proc
371434
global _cache_maxsize
435+
global _shortest_path
372436
global _nbr_topk
437+
global _apsp
373438
# -------------------------------------------------------
374439

375440
_Gk = nk.nxadapter.nx2nk(G, weightAttr=weight)
@@ -380,6 +445,7 @@ def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
380445
_exp_power = exp_power
381446
_proc = proc
382447
_cache_maxsize = cache_maxsize
448+
_shortest_path = shortest_path
383449
_nbr_topk = nbr_topk
384450

385451
# Construct nx to nk dictionary
@@ -388,6 +454,11 @@ def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
388454
nx2nk_ndict[n] = idx
389455
nk2nx_ndict[idx] = n
390456

457+
if _shortest_path == "all_pairs":
458+
# Construct the all pair shortest path dictionary
459+
if not _apsp:
460+
_apsp = _get_all_pairs_shortest_path()
461+
391462
if edge_list:
392463
args = [(nx2nk_ndict[source], nx2nk_ndict[target]) for source, target in edge_list]
393464
else:
@@ -500,12 +571,11 @@ def _compute_ricci_flow(G: nx.Graph, weight="weight",
500571
G = nx.Graph(G.subgraph(max(nx.connected_components(G), key=len)))
501572
G.remove_edges_from(nx.selfloop_edges(G))
502573

503-
logger.info("Number of nodes: %d" % G.number_of_nodes())
504-
logger.info("Number of edges: %d" % G.number_of_edges())
505-
506574
# Set normalized weight to be the number of edges.
507575
normalized_weight = float(G.number_of_edges())
508576

577+
global _apsp
578+
509579
# Start compute edge Ricci flow
510580
t0 = time.time()
511581

@@ -517,6 +587,10 @@ def _compute_ricci_flow(G: nx.Graph, weight="weight",
517587
for (v1, v2) in G.edges():
518588
G[v1][v2]["original_RC"] = G[v1][v2]["ricciCurvature"]
519589

590+
# clear the APSP since the graph have changed.
591+
_apsp = {}
592+
593+
520594
# Start the Ricci flow process
521595
for i in range(iterations):
522596
for (v1, v2) in G.edges():
@@ -550,7 +624,10 @@ def _compute_ricci_flow(G: nx.Graph, weight="weight",
550624
normalized_weight = float(G.number_of_edges())
551625

552626
for n1, n2 in G.edges():
553-
logger.debug(n1, n2, G[n1][n2])
627+
logger.debug("%s %s %s" % (n1, n2, G[n1][n2]))
628+
629+
# clear the APSP since the graph have changed.
630+
_apsp = {}
554631

555632
logger.info("\n%8f secs for Ricci flow computation." % (time.time() - t0))
556633

@@ -564,7 +641,8 @@ class OllivierRicci:
564641
"""
565642

566643
def __init__(self, G: nx.Graph, weight="weight", alpha=0.5, method="OTD",
567-
base=math.e, exp_power=2, proc=cpu_count(), chunksize=None, cache_maxsize=1000000,
644+
base=math.e, exp_power=2, proc=cpu_count(), chunksize=None, shortest_path="all_pairs",
645+
cache_maxsize=1000000,
568646
nbr_topk=1000, verbose="ERROR"):
569647
"""Initialized a container to compute Ollivier-Ricci curvature/flow.
570648
@@ -596,13 +674,15 @@ def __init__(self, G: nx.Graph, weight="weight", alpha=0.5, method="OTD",
596674
Number of processor used for multiprocessing. (Default value = `cpu_count()`)
597675
chunksize : int
598676
Chunk size for multiprocessing, set None for auto decide. (Default value = `None`)
677+
shortest_path : {"all_pairs","pairwise"}
678+
Method to compute shortest path. (Default value = `all_pairs`)
599679
cache_maxsize : int
600680
Max size for LRU cache for pairwise shortest path computation.
601681
Set this to `None` for unlimited cache. (Default value = 1000000)
602682
nbr_topk : int
603683
Only take the top k edge weight neighbors for density distribution.
604684
Smaller k run faster but the result is less accurate. (Default value = 1000)
605-
verbose: {"INFO","DEBUG","ERROR"}
685+
verbose : {"INFO","DEBUG","ERROR"}
606686
Verbose level. (Default value = "ERROR")
607687
- "INFO": show only iteration process log.
608688
- "DEBUG": show all output logs.
@@ -618,6 +698,7 @@ def __init__(self, G: nx.Graph, weight="weight", alpha=0.5, method="OTD",
618698
self.proc = proc
619699
self.chunksize = chunksize
620700
self.cache_maxsize = cache_maxsize
701+
self.shortest_path = shortest_path
621702
self.nbr_topk = nbr_topk
622703

623704
self.set_verbose(verbose)
@@ -658,7 +739,8 @@ def compute_ricci_curvature_edges(self, edge_list=None):
658739
alpha=self.alpha, method=self.method,
659740
base=self.base, exp_power=self.exp_power,
660741
proc=self.proc, chunksize=self.chunksize,
661-
cache_maxsize=self.cache_maxsize, nbr_topk=self.nbr_topk)
742+
cache_maxsize=self.cache_maxsize, shortest_path=self.shortest_path,
743+
nbr_topk=self.nbr_topk)
662744

663745
def compute_ricci_curvature(self):
664746
"""Compute Ricci curvature of edges and nodes.
@@ -684,6 +766,7 @@ def compute_ricci_curvature(self):
684766
alpha=self.alpha, method=self.method,
685767
base=self.base, exp_power=self.exp_power,
686768
proc=self.proc, chunksize=self.chunksize, cache_maxsize=self.cache_maxsize,
769+
shortest_path = self.shortest_path,
687770
nbr_topk=self.nbr_topk)
688771
return self.G
689772

@@ -724,5 +807,5 @@ def compute_ricci_flow(self, iterations=10, step=1, delta=1e-4, surgery=(lambda
724807
alpha=self.alpha, method=self.method,
725808
base=self.base, exp_power=self.exp_power,
726809
proc=self.proc, chunksize=self.chunksize, cache_maxsize=self.cache_maxsize,
727-
nbr_topk=self.nbr_topk)
810+
shortest_path=self.shortest_path, nbr_topk=self.nbr_topk)
728811
return self.G

0 commit comments

Comments
 (0)