rename & update readme

ponythewhite · ponythewhite · commit b8ba42b5cc10 · 2024-11-24T20:59:42.000+01:00
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "cleora-python"
+name = "pycleora"
 version = "2.0.0"
 edition = "2018"
 license-file = "LICENSE"
@@ -8,7 +8,7 @@ documentation = "https://github.com/synerise/cleora"
 homepage = "https://github.com/synerise/cleora"
 repository = "https://github.com/synerise/cleora"
 description = """
-Sparse graph structure and markov-propagation on embeddings exposed via python bindings
+Sparse hypergraph structure and markov-propagation for node embeddings exposed via Python bindings.
 """
 
 [lib]
diff --git a/README.md b/README.md
@@ -21,6 +21,81 @@ _**Cleora** is a genus of moths in the family **Geometridae**. Their scientific
 
 Cleora is a general-purpose model for efficient, scalable learning of stable and inductive entity embeddings for heterogeneous relational data.
 
+**Cleora** is now available as a Python package, _pycleora_. Key improvements compared to the previous version:
+* _performance optimizations_: 10x faster embedding times
+* _performance optimizations_: reduced memory usage
+* _latest research_: significantly improved embedding quality
+* _new feature_: can create graphs from a Python iterator in addition to TSV files
+* _new feature_: seamless integration with _NumPy_
+* _new feature_: item attributes support via custom embeddings initialization
+* _new feature_: adjustable vector projection / normalization after each propagation step
+
+**Breaking changes:**
+* The _transient_ modifier is no longer supported. Creating _complex::reflexive_ columns for hypergraph embeddings, grouped by the transient entity, gives better results.
+
+
+**Example usage:**
+
+```
+import pycleora
+import numpy as np
+import pandas as pd
+import random
+
+# Generate example data
+customers = [f"Customer_{i}" for i in range(1, 20)]
+products = [f"Product_{j}" for j in range(1, 20)]
+
+data = {
+    "customer": random.choices(customers, k=100),
+    "product": random.choices(products, k=100),
+}
+
+# Create DataFrame
+df = pd.DataFrame(data)
+
+# Create hyperedges
+customer_products = df.groupby('customer')['product'].apply(list).values
+
+# Convert to Cleora input format
+cleora_input = map(lambda x: ' '.join(x), customer_products)
+
+# Create Markov transition matrix for the hypergraph
+mat = pycleora.SparseMatrix.from_iterator(cleora_input, columns='complex::reflexive::product')
+
+# Look at entity ids in the matrix, corresponding to embedding vectors
+print(mat.entity_ids)
+# e.g. ['Product_5', 'Product_3', 'Product_2', 'Product_4', 'Product_1', ...]
+
+# Initialize embedding vectors externally, using text, image, random vectors
+# embeddings = ...
+
+# Or use built-in random deterministic initialization
+embeddings = mat.initialize_deterministically(1024)
+
+# Perform Markov random walk, then normalize however many times we want
+
+NUM_WALKS = 3   # The optimal number depends on the graph; values between 3 and 7 typically yield good results.
+                # Lower values tend to capture co-occurrence; higher values capture substitutability in a context.
+
+for i in range(NUM_WALKS):
+    # Can propagate with a symmetric matrix as well, but left Markov is a great default
+    embeddings = mat.left_markov_propagate(embeddings)
+    # Normalize with L2 norm by default, for the embeddings to reside on a hypersphere. Can use standardization instead.
+    embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True)
+
+# We're done, here are our embeddings
+
+for entity, embedding in zip(mat.entity_ids, embeddings):
+    print(entity, embedding)
+
+# We can now compare our embeddings with dot product (since they are L2 normalized)
+
+print(np.dot(embeddings[0], embeddings[1]))
+print(np.dot(embeddings[0], embeddings[2]))
+print(np.dot(embeddings[0], embeddings[3]))
+```
+
 **Read the whitepaper ["Cleora: A Simple, Strong and Scalable Graph Embedding Scheme"](https://arxiv.org/abs/2102.02302)**
 
 Cleora embeds entities in *n-dimensional spherical spaces* utilizing extremely fast stable, iterative random projections, which allows for unparalleled performance and scalability. 
@@ -166,14 +241,6 @@ The technical properties described above imply good production-readiness of Cleo
 
 More information can be found in [the full documentation](https://cleora.readthedocs.io/).
 
-## Cleora Enterprise
-**Cleora Enterprise** is now available for selected customers. Key improvements in addition to this open-source version:
-* _performance optimizations_: 10x faster embedding times
-* _latest research_: significantly improved embedding quality
-* _new feature_: item attributes support
-* _new feature_: multimodal fusion of multiple graphs, text and image embeddings
-* _new feature_: compressed embeddings in various formats (spherical, hyperbolic, sparse)
-
 For details contact us at cleora@synerise.com
 
 ## Cite
diff --git a/examples/cleora_loop.py b/examples/cleora_loop.py
@@ -1,7 +1,7 @@
 import time
 
 import numpy as np
-from cleora_python import SparseMatrix
+from pycleora import SparseMatrix
 
 start_time = time.time()
 
diff --git a/examples/column_indices.py b/examples/column_indices.py
@@ -1,5 +1,5 @@
 import numpy as np
-from cleora_python import SparseMatrix
+from pycleora import SparseMatrix
 
 hyperedges = [
     'a\t1',
diff --git a/examples/from_iterator.py b/examples/from_iterator.py
@@ -1,7 +1,7 @@
 import time
 
 import numpy as np
-from cleora_python import SparseMatrix
+from pycleora import SparseMatrix
 
 start_time = time.time()
 
diff --git a/examples/graph_pickle.py b/examples/graph_pickle.py
@@ -1,7 +1,7 @@
 import time
 
 import numpy as np
-from cleora_python import SparseMatrix
+from pycleora import SparseMatrix
 
 import pickle
 
diff --git a/examples/predefined_cleora_loop.py b/examples/predefined_cleora_loop.py
@@ -1,6 +1,6 @@
 import time
 
-from cleora_python import embed_using_baseline_cleora, SparseMatrix
+from pycleora import embed_using_baseline_cleora, SparseMatrix
 
 start_time = time.time()
 graph = SparseMatrix.from_files(["perf_inputs/0.tsv", "perf_inputs/1.tsv", "perf_inputs/2.tsv", "perf_inputs/3.tsv", "perf_inputs/4.tsv", "perf_inputs/5.tsv", "perf_inputs/6.tsv", "perf_inputs/7.tsv"], "complex::reflexive::name")
diff --git a/pycleora/.gitignore b/pycleora/.gitignore
diff --git a/pycleora/__init__.py b/pycleora/__init__.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from .cleora import SparseMatrix
+from .pycleora import SparseMatrix
 
 def embed_using_baseline_cleora(graph, feature_dim: int, iter: int):
     embeddings = graph.initialize_deterministically(feature_dim)
diff --git a/pycleora/cleora_python.cpython-39-x86_64-linux-gnu.so b/pycleora/cleora_python.cpython-39-x86_64-linux-gnu.so
diff --git a/pycleora/pycleora.cpython-39-x86_64-linux-gnu.so b/pycleora/pycleora.cpython-39-x86_64-linux-gnu.so
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,13 +3,20 @@ requires = ["maturin>=1.2.3"]
 build-backend = "maturin"
 
 [project]
-name = "cleora_python"
+name = "pycleora"
 requires-python = ">=3.7"
 classifiers = [
     "Programming Language :: Rust",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
 ]
+version = "2.0.0"
+description = "Sparse hypergraph structure and markov-propagation for node embeddings exposed via Python bindings."
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "Jacek Dabrowski", email = "jack.dabrowski@synerise.com" }
+]
+license = { file = "LICENSE" }
 
 
 [tool.maturin]
diff --git a/src/lib.rs b/src/lib.rs
@@ -234,7 +234,7 @@ fn init_value(col: usize, hsh: u64, fixed_random_value: i64) -> f32 {
 }
 
 #[pymodule]
-#[pyo3(name = "cleora")]
+#[pyo3(name = "pycleora")]
 fn pycleora(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<SparseMatrix>()?;
     Ok(())
diff --git a/src/sparse_matrix_builder.rs b/src/sparse_matrix_builder.rs
@@ -263,7 +263,7 @@ impl SparseMatrixBuffer {
 
     fn update_row(&mut self, hash: u64, count: u32) {
         let val = 1f32 / (count as f32);
-        let mut e = self.hash_2_row.entry(hash).or_default();
+        let e = self.hash_2_row.entry(hash).or_default();
         e.occurrence += count;
         e.row_sum += val
     }