Merge pull request #12 from saibalmars/speed_test

saibalmars · web-flow · commit e8f08567fde8 · 2020-03-23T23:07:25.000-07:00
Provide choice of shortest path computation method
diff --git a/GraphRicciCurvature/OllivierRicci.py b/GraphRicciCurvature/OllivierRicci.py
@@ -44,23 +44,81 @@
 _exp_power = 2
 _proc = cpu_count()
 _cache_maxsize = 1000000
-_nbr_topk = 50
+_shortest_path = "all_pairs"
+_nbr_topk = 1000
+_apsp = {}
 
 # -------------------------------------------------------
 
+@lru_cache(_cache_maxsize)
+def _get_single_node_neighbors_distributions(node, direction="successors"):
+    """Get the neighbor density distribution of given node `node`.
+
+    Parameters
+    ----------
+    node : int
+        Node index in Networkit graph `_Gk`.
+    direction : {"predecessors", "successors"}
+        Direction of neighbors in directed graph. (Default value: "successors")
+
+    Returns
+    -------
+    distributions : lists of float
+        Density distributions of neighbors up to top `_nbr_topk` nodes.
+    nbrs : lists of int
+        Neighbor index up to top `_nbr_topk` nodes.
+
+    """
+    if _Gk.isDirected():
+        if direction == "predecessors":
+            neighbors = _Gk.inNeighbors(node)
+        else:  # successors
+            neighbors = _Gk.neighbors(node)
+    else:
+        neighbors = _Gk.neighbors(node)
+
+    # Get sum of distributions from x's all neighbors
+    heap_weight_node_pair = []
+    if direction == "predecessors":
+        for nbr in neighbors:
+            if len(heap_weight_node_pair) < _nbr_topk:
+                heapq.heappush(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(nbr, node) ** _exp_power), nbr))
+            else:
+                heapq.heappushpop(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(nbr, node) ** _exp_power), nbr))
+    else:  # successors
+        for nbr in neighbors:
+            if len(heap_weight_node_pair) < _nbr_topk:
+                heapq.heappush(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(node, nbr) ** _exp_power), nbr))
+            else:
+                heapq.heappushpop(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(node, nbr) ** _exp_power), nbr))
+
+    nbr_edge_weight_sum = sum([x[0] for x in heap_weight_node_pair])
+
+    if nbr_edge_weight_sum > EPSILON:
+        # Sum need to be not too small to prevent divided by zero
+        distributions = [(1.0 - _alpha) * w / nbr_edge_weight_sum for w, _ in heap_weight_node_pair]
+        nbr = [x[1] for x in heap_weight_node_pair]
+        return distributions + [_alpha], nbr + [node]
+    elif len(neighbors) == 0:
+        # No neighbor, all mass stay at node
+        return [1], [node]
+    else:
+        logger.warning("Neighbor weight sum too small, list:", heap_weight_node_pair)
+        distributions = [(1.0 - _alpha) / len(heap_weight_node_pair)] * len(heap_weight_node_pair)
+        nbr = [x[1] for x in heap_weight_node_pair]
+        return distributions + [_alpha], nbr + [node]
 
-def _distribute_densities(source, target, nbr_topk=_nbr_topk):
+
+def _distribute_densities(source, target):
     """Get the density distributions of source and target node, and the cost (all pair shortest paths) between
-    all source's and target's neighbors.
+    all source's and target's neighbors. Notice that only neighbors with top `_nbr_topk` edge weights.
 
     Parameters
     ----------
     source : int
         Source node index in Networkit graph `_Gk`.
     target : int
         Target node index in Networkit graph `_Gk`.
-    nbr_topk : int
-        Only take the neighbors with top k edge weights.
     Returns
     -------
     x : (m,) numpy.ndarray
@@ -72,50 +130,14 @@ def _distribute_densities(source, target, nbr_topk=_nbr_topk):
 
     """
 
-    # Append source and target node into weight distribution matrix x,y
+    # Distribute densities for source and source's neighbors as x
     if _Gk.isDirected():
-        source_nbr = _Gk.inNeighbors(source)
+        x, source_topknbr = _get_single_node_neighbors_distributions(source, "predecessors")
     else:
-        source_nbr = _Gk.neighbors(source)
-    target_nbr = _Gk.neighbors(target)
-
-    def _get_single_node_neighbors_distributions(node, neighbors, direction="successors"):
-        # Get sum of distributions from x's all neighbors
-        heap_weight_node_pair = []
-        if direction == "predecessors":
-            for nbr in neighbors:
-                if len(heap_weight_node_pair) < nbr_topk:
-                    heapq.heappush(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(nbr, node) ** _exp_power), nbr))
-                else:
-                    heapq.heappushpop(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(nbr, node) ** _exp_power), nbr))
-        else:  # successors
-            for nbr in neighbors:
-                if len(heap_weight_node_pair) < nbr_topk:
-                    heapq.heappush(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(node, nbr) ** _exp_power), nbr))
-                else:
-                    heapq.heappushpop(heap_weight_node_pair, (_base ** (-_get_pairwise_sp(node, nbr) ** _exp_power), nbr))
-
-        nbr_edge_weight_sum = sum([x[0] for x in heap_weight_node_pair])
-
-        if nbr_edge_weight_sum > EPSILON:
-            # Sum need to be not too small to prevent divided by zero
-            distributions = [(1.0 - _alpha) * w / nbr_edge_weight_sum for w, _ in heap_weight_node_pair]
-            nbr = [x[1] for x in heap_weight_node_pair]
-            return distributions + [_alpha], nbr + [node]
-        elif len(neighbors) == 0:
-            # No neighbor, all mass stay at node
-            return [1], [node]
-        else:
-            logger.warning("Neighbor weight sum too small, list:", heap_weight_node_pair)
-            distributions = [(1.0 - _alpha) / len(heap_weight_node_pair)] * len(heap_weight_node_pair)
-            nbr = [x[1] for x in heap_weight_node_pair]
-            return distributions + [_alpha], nbr + [node]
-
-    # Distribute densities for source and source's neighbors as x
-    x, source_topknbr = _get_single_node_neighbors_distributions(source, source_nbr, "predecessors")
+        x, source_topknbr = _get_single_node_neighbors_distributions(source, "successors")
 
     # Distribute densities for target and target's neighbors as y
-    y, target_topknbr = _get_single_node_neighbors_distributions(target, target_nbr, "successors")
+    y, target_topknbr = _get_single_node_neighbors_distributions(target, "successors")
 
     # construct the cost dictionary from x to y
     d = np.zeros((len(x), len(y)))
@@ -131,7 +153,7 @@ def _get_single_node_neighbors_distributions(node, neighbors, direction="success
 
 
 @lru_cache(_cache_maxsize)
-def _get_pairwise_sp(source, target):
+def _source_target_shortest_path(source, target):
     """Compute pairwise shortest path from `source` to `target` by BidirectionalDijkstra via Networkit.
 
     Parameters
@@ -153,6 +175,42 @@ def _get_pairwise_sp(source, target):
     return length
 
 
+def _get_pairwise_sp(source, target):
+    """Compute pairwise shortest path from `source` to `target`.
+
+    Parameters
+    ----------
+    source : int
+        Source node index in Networkit graph `_Gk`.
+    target : int
+        Target node index in Networkit graph `_Gk`.
+
+    Returns
+    -------
+    length : float
+        Pairwise shortest path length.
+
+    """
+
+    if _shortest_path == "pairwise":
+        return _source_target_shortest_path(source, target)
+
+    return _apsp[source][target]
+
+
+def _get_all_pairs_shortest_path():
+    """Pre-compute all pairs shortest paths of the assigned graph `_Gk`."""
+    logger.info("Start to compute all pair shortest path.")
+
+    global _Gk
+
+    t0 = time.time()
+    apsp = nk.distance.APSP(_Gk).run().getDistances()
+    logger.info("%8f secs for all pair by NetworKit." % (time.time() - t0))
+
+    return apsp
+
+
 def _optimal_transportation_distance(x, y, d):
     """Compute the optimal transportation distance (OTD) of the given density distributions by CVXPY.
 
@@ -287,12 +345,12 @@ def _compute_ricci_curvature_single_edge(source, target):
     assert _method in ["OTD", "ATD", "Sinkhorn"], \
         'Method %s not found, support method:["OTD", "ATD", "Sinkhorn"]' % _method
     if _method == "OTD":
-        x, y, d = _distribute_densities(source, target, _nbr_topk)
+        x, y, d = _distribute_densities(source, target)
         m = _optimal_transportation_distance(x, y, d)
     elif _method == "ATD":
         m = _average_transportation_distance(source, target)
     elif _method == "Sinkhorn":
-        x, y, d = _distribute_densities(source, target, _nbr_topk)
+        x, y, d = _distribute_densities(source, target)
         m = _sinkhorn_distance(x, y, d)
 
     # compute Ricci curvature: k=1-(m_{x,y})/d(x,y)
@@ -310,7 +368,7 @@ def _wrap_compute_single_edge(stuff):
 def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
                                    alpha=0.5, method="OTD",
                                    base=math.e, exp_power=2, proc=cpu_count(), chunksize=None, cache_maxsize=1000000,
-                                   nbr_topk=1000):
+                                   shortest_path="all_pairs", nbr_topk=1000):
     """Compute Ricci curvature for edges in  given edge lists.
 
     Parameters
@@ -344,6 +402,8 @@ def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
     cache_maxsize : int
         Max size for LRU cache for pairwise shortest path computation.
         Set this to `None` for unlimited cache. (Default value = 1000000)
+    shortest_path : {"all_pairs","pairwise"}
+        Method to compute shortest path. (Default value = `all_pairs`)
     nbr_topk : int
         Only take the top k edge weight neighbors for density distribution.
         Smaller k run faster but the result is less accurate. (Default value = 1000)
@@ -355,6 +415,9 @@ def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
 
     """
 
+    logger.info("Number of nodes: %d" % G.number_of_nodes())
+    logger.info("Number of edges: %d" % G.number_of_edges())
+
     if not nx.get_edge_attributes(G, weight):
         print('Edge weight not detected in graph, use "weight" as default edge weight.')
         for (v1, v2) in G.edges():
@@ -369,7 +432,9 @@ def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
     global _exp_power
     global _proc
     global _cache_maxsize
+    global _shortest_path
     global _nbr_topk
+    global _apsp
     # -------------------------------------------------------
 
     _Gk = nk.nxadapter.nx2nk(G, weightAttr=weight)
@@ -380,6 +445,7 @@ def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
     _exp_power = exp_power
     _proc = proc
     _cache_maxsize = cache_maxsize
+    _shortest_path = shortest_path
     _nbr_topk = nbr_topk
 
     # Construct nx to nk dictionary
@@ -388,6 +454,11 @@ def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
         nx2nk_ndict[n] = idx
         nk2nx_ndict[idx] = n
 
+    if _shortest_path == "all_pairs":
+        # Construct the all pair shortest path dictionary
+        if not _apsp:
+            _apsp = _get_all_pairs_shortest_path()
+
     if edge_list:
         args = [(nx2nk_ndict[source], nx2nk_ndict[target]) for source, target in edge_list]
     else:
@@ -500,12 +571,11 @@ def _compute_ricci_flow(G: nx.Graph, weight="weight",
         G = nx.Graph(G.subgraph(max(nx.connected_components(G), key=len)))
     G.remove_edges_from(nx.selfloop_edges(G))
 
-    logger.info("Number of nodes: %d" % G.number_of_nodes())
-    logger.info("Number of edges: %d" % G.number_of_edges())
-
     # Set normalized weight to be the number of edges.
     normalized_weight = float(G.number_of_edges())
 
+    global _apsp
+
     # Start compute edge Ricci flow
     t0 = time.time()
 
@@ -517,6 +587,10 @@ def _compute_ricci_flow(G: nx.Graph, weight="weight",
         for (v1, v2) in G.edges():
             G[v1][v2]["original_RC"] = G[v1][v2]["ricciCurvature"]
 
+        # clear the APSP since the graph have changed.
+        _apsp = {}
+
+
     # Start the Ricci flow process
     for i in range(iterations):
         for (v1, v2) in G.edges():
@@ -550,7 +624,10 @@ def _compute_ricci_flow(G: nx.Graph, weight="weight",
             normalized_weight = float(G.number_of_edges())
 
         for n1, n2 in G.edges():
-            logger.debug(n1, n2, G[n1][n2])
+            logger.debug("%s %s %s" % (n1, n2, G[n1][n2]))
+
+        # clear the APSP since the graph have changed.
+        _apsp = {}
 
     logger.info("\n%8f secs for Ricci flow computation." % (time.time() - t0))
 
@@ -564,7 +641,8 @@ class OllivierRicci:
     """
 
     def __init__(self, G: nx.Graph, weight="weight", alpha=0.5, method="OTD",
-                 base=math.e, exp_power=2, proc=cpu_count(), chunksize=None, cache_maxsize=1000000,
+                 base=math.e, exp_power=2, proc=cpu_count(), chunksize=None, shortest_path="all_pairs",
+                 cache_maxsize=1000000,
                  nbr_topk=1000, verbose="ERROR"):
         """Initialized a container to compute Ollivier-Ricci curvature/flow.
 
@@ -596,13 +674,15 @@ def __init__(self, G: nx.Graph, weight="weight", alpha=0.5, method="OTD",
             Number of processor used for multiprocessing. (Default value = `cpu_count()`)
         chunksize : int
             Chunk size for multiprocessing, set None for auto decide. (Default value = `None`)
+        shortest_path : {"all_pairs","pairwise"}
+            Method to compute shortest path. (Default value = `all_pairs`)
         cache_maxsize : int
             Max size for LRU cache for pairwise shortest path computation.
             Set this to `None` for unlimited cache. (Default value = 1000000)
         nbr_topk : int
             Only take the top k edge weight neighbors for density distribution.
             Smaller k run faster but the result is less accurate. (Default value = 1000)
-        verbose: {"INFO","DEBUG","ERROR"}
+        verbose : {"INFO","DEBUG","ERROR"}
             Verbose level. (Default value = "ERROR")
                 - "INFO": show only iteration process log.
                 - "DEBUG": show all output logs.
@@ -618,6 +698,7 @@ def __init__(self, G: nx.Graph, weight="weight", alpha=0.5, method="OTD",
         self.proc = proc
         self.chunksize = chunksize
         self.cache_maxsize = cache_maxsize
+        self.shortest_path = shortest_path
         self.nbr_topk = nbr_topk
 
         self.set_verbose(verbose)
@@ -658,7 +739,8 @@ def compute_ricci_curvature_edges(self, edge_list=None):
                                               alpha=self.alpha, method=self.method,
                                               base=self.base, exp_power=self.exp_power,
                                               proc=self.proc, chunksize=self.chunksize,
-                                              cache_maxsize=self.cache_maxsize, nbr_topk=self.nbr_topk)
+                                              cache_maxsize=self.cache_maxsize, shortest_path=self.shortest_path,
+                                              nbr_topk=self.nbr_topk)
 
     def compute_ricci_curvature(self):
         """Compute Ricci curvature of edges and nodes.
@@ -684,6 +766,7 @@ def compute_ricci_curvature(self):
                                           alpha=self.alpha, method=self.method,
                                           base=self.base, exp_power=self.exp_power,
                                           proc=self.proc, chunksize=self.chunksize, cache_maxsize=self.cache_maxsize,
+                                          shortest_path = self.shortest_path,
                                           nbr_topk=self.nbr_topk)
         return self.G
 
@@ -724,5 +807,5 @@ def compute_ricci_flow(self, iterations=10, step=1, delta=1e-4, surgery=(lambda
                                      alpha=self.alpha, method=self.method,
                                      base=self.base, exp_power=self.exp_power,
                                      proc=self.proc, chunksize=self.chunksize, cache_maxsize=self.cache_maxsize,
-                                     nbr_topk=self.nbr_topk)
+                                     shortest_path=self.shortest_path, nbr_topk=self.nbr_topk)
         return self.G