
Commit a1ab6aa

Add tests
1 parent ab97247 commit a1ab6aa

File tree: 7 files changed, +470 additions, 0 deletions


pyproject.toml

Lines changed: 9 additions & 0 deletions
@@ -51,3 +51,12 @@ dependencies = [
 
 [project.optional-dependencies]
 deep_learning = ["keras", "torch"]
+
+# PyTest Configuration. Later, PyTest will support the [tool.pytest] table.
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q"
+testpaths = [
+    "tests",
+    "integration",
+]
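With these options, pytest 6.0+ runs quietly with a summary of non-passing tests (-ra -q) and collects tests from the tests and integration directories. A minimal sketch of a test module that this configuration would pick up (the file name tests/test_smoke.py and its contents are hypothetical, not part of this commit):

# tests/test_smoke.py (hypothetical): a minimal test collected under the configured testpaths.
import pygad

def fitness_func(ga_instance, solution, solution_idx):
    # Toy fitness: prefer solutions whose genes sum close to 10.
    return 1.0 / (abs(sum(solution) - 10) + 1e-8)

def test_ga_runs():
    ga = pygad.GA(num_generations=5,
                  sol_per_pop=10,
                  num_parents_mating=4,
                  num_genes=3,
                  fitness_func=fitness_func,
                  suppress_warnings=True)
    ga.run()
    _, best_fitness, _ = ga.best_solution()
    assert best_fitness > 0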

tests/example_clustering_2.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
import numpy
import matplotlib.pyplot
import pygad

cluster1_num_samples = 10
cluster1_x1_start = 0
cluster1_x1_end = 5
cluster1_x2_start = 2
cluster1_x2_end = 6
cluster1_x1 = numpy.random.random(size=(cluster1_num_samples))
cluster1_x1 = cluster1_x1 * (cluster1_x1_end - cluster1_x1_start) + cluster1_x1_start
cluster1_x2 = numpy.random.random(size=(cluster1_num_samples))
cluster1_x2 = cluster1_x2 * (cluster1_x2_end - cluster1_x2_start) + cluster1_x2_start

cluster2_num_samples = 10
cluster2_x1_start = 10
cluster2_x1_end = 15
cluster2_x2_start = 8
cluster2_x2_end = 12
cluster2_x1 = numpy.random.random(size=(cluster2_num_samples))
cluster2_x1 = cluster2_x1 * (cluster2_x1_end - cluster2_x1_start) + cluster2_x1_start
cluster2_x2 = numpy.random.random(size=(cluster2_num_samples))
cluster2_x2 = cluster2_x2 * (cluster2_x2_end - cluster2_x2_start) + cluster2_x2_start

c1 = numpy.array([cluster1_x1, cluster1_x2]).T
c2 = numpy.array([cluster2_x1, cluster2_x2]).T

data = numpy.concatenate((c1, c2), axis=0)

matplotlib.pyplot.scatter(cluster1_x1, cluster1_x2)
matplotlib.pyplot.scatter(cluster2_x1, cluster2_x2)
matplotlib.pyplot.title("Optimal Clustering")
matplotlib.pyplot.show()

def euclidean_distance(X, Y):
    """
    Calculate the euclidean distance between X and Y. It accepts:
    :X should be a matrix of size (N, f) where N is the number of samples and f is the number of features for each sample.
    :Y should be of size f. In other words, it is a single sample.

    Returns a vector of N elements with the distances between the N samples and Y.
    """

    return numpy.sqrt(numpy.sum(numpy.power(X - Y, 2), axis=1))

def cluster_data(solution, solution_idx):
    """
    Clusters the data based on the current solution.
    """

    global num_clusters, data
    feature_vector_length = data.shape[1]
    cluster_centers = [] # A list of size (C, f) where C is the number of clusters and f is the number of features representing each sample.
    all_clusters_dists = [] # A list of size (C, N) where C is the number of clusters and N is the number of data samples. It holds the distances between each cluster center and all the data samples.
    clusters = [] # A list with C elements where each element holds the indices of the samples within a cluster.
    clusters_sum_dist = [] # A list with C elements where each element represents the sum of distances of the samples within a cluster.

    for clust_idx in range(num_clusters):
        # Extract the current cluster center from the solution.
        cluster_centers.append(solution[feature_vector_length*clust_idx:feature_vector_length*(clust_idx+1)])
        # Calculate the distance (e.g. euclidean) between the current cluster center and all samples.
        cluster_center_dists = euclidean_distance(data, cluster_centers[clust_idx])
        all_clusters_dists.append(numpy.array(cluster_center_dists))

    cluster_centers = numpy.array(cluster_centers)
    all_clusters_dists = numpy.array(all_clusters_dists)

    # A 1D array that, for each sample, holds the index of the cluster with the smallest distance.
    # In other words, the array holds the sample's cluster index.
    cluster_indices = numpy.argmin(all_clusters_dists, axis=0)
    for clust_idx in range(num_clusters):
        clusters.append(numpy.where(cluster_indices == clust_idx)[0])
        # Calculate the sum of distances for the cluster.
        if len(clusters[clust_idx]) == 0:
            # In case the cluster is empty (i.e. has zero samples).
            clusters_sum_dist.append(0)
        else:
            # When the cluster is not empty (i.e. has at least 1 sample).
            clusters_sum_dist.append(numpy.sum(all_clusters_dists[clust_idx, clusters[clust_idx]]))
            # clusters_sum_dist.append(numpy.sum(euclidean_distance(data[clusters[clust_idx], :], cluster_centers[clust_idx])))

    clusters_sum_dist = numpy.array(clusters_sum_dist)

    return cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist

def fitness_func(ga_instance, solution, solution_idx):
    _, _, _, _, clusters_sum_dist = cluster_data(solution, solution_idx)

    # The tiny value 0.00000001 is added to the denominator in case the sum of distances is 0.
    fitness = 1.0 / (numpy.sum(clusters_sum_dist) + 0.00000001)

    return fitness

num_clusters = 2
num_genes = num_clusters * data.shape[1]

ga_instance = pygad.GA(num_generations=100,
                       sol_per_pop=10,
                       num_parents_mating=5,
                       init_range_low=-6,
                       init_range_high=20,
                       keep_parents=2,
                       num_genes=num_genes,
                       fitness_func=fitness_func,
                       suppress_warnings=True)

ga_instance.run()

best_solution, best_solution_fitness, best_solution_idx = ga_instance.best_solution()
print("Best solution is {bs}".format(bs=best_solution))
print("Fitness of the best solution is {bsf}".format(bsf=best_solution_fitness))
print("Best solution found after {gen} generations".format(gen=ga_instance.best_solution_generation))

cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist = cluster_data(best_solution, best_solution_idx)

for cluster_idx in range(num_clusters):
    cluster_x = data[clusters[cluster_idx], 0]
    cluster_y = data[clusters[cluster_idx], 1]
    matplotlib.pyplot.scatter(cluster_x, cluster_y)
    matplotlib.pyplot.scatter(cluster_centers[cluster_idx, 0], cluster_centers[cluster_idx, 1], linewidths=5)
matplotlib.pyplot.title("Clustering using PyGAD")
matplotlib.pyplot.show()
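The GA solution here is a flat vector of num_genes = num_clusters * data.shape[1] values: the coordinates of all cluster centers laid end to end, which is what cluster_data() slices apart. A minimal decoding sketch, assuming the variables from the script above (not part of the commit):

# Hypothetical sketch: reshape the flat solution vector into a
# (num_clusters, num_features) matrix of cluster centers.
centers = numpy.array(best_solution).reshape(num_clusters, data.shape[1])
print("Cluster centers:\n", centers)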

tests/example_clustering_3.py

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
import numpy
import matplotlib.pyplot
import pygad

cluster1_num_samples = 20
cluster1_x1_start = 0
cluster1_x1_end = 5
cluster1_x2_start = 2
cluster1_x2_end = 6
cluster1_x1 = numpy.random.random(size=(cluster1_num_samples))
cluster1_x1 = cluster1_x1 * (cluster1_x1_end - cluster1_x1_start) + cluster1_x1_start
cluster1_x2 = numpy.random.random(size=(cluster1_num_samples))
cluster1_x2 = cluster1_x2 * (cluster1_x2_end - cluster1_x2_start) + cluster1_x2_start

cluster2_num_samples = 20
cluster2_x1_start = 4
cluster2_x1_end = 12
cluster2_x2_start = 14
cluster2_x2_end = 18
cluster2_x1 = numpy.random.random(size=(cluster2_num_samples))
cluster2_x1 = cluster2_x1 * (cluster2_x1_end - cluster2_x1_start) + cluster2_x1_start
cluster2_x2 = numpy.random.random(size=(cluster2_num_samples))
cluster2_x2 = cluster2_x2 * (cluster2_x2_end - cluster2_x2_start) + cluster2_x2_start

cluster3_num_samples = 20
cluster3_x1_start = 12
cluster3_x1_end = 18
cluster3_x2_start = 8
cluster3_x2_end = 11
cluster3_x1 = numpy.random.random(size=(cluster3_num_samples))
cluster3_x1 = cluster3_x1 * (cluster3_x1_end - cluster3_x1_start) + cluster3_x1_start
cluster3_x2 = numpy.random.random(size=(cluster3_num_samples))
cluster3_x2 = cluster3_x2 * (cluster3_x2_end - cluster3_x2_start) + cluster3_x2_start

c1 = numpy.array([cluster1_x1, cluster1_x2]).T
c2 = numpy.array([cluster2_x1, cluster2_x2]).T
c3 = numpy.array([cluster3_x1, cluster3_x2]).T

data = numpy.concatenate((c1, c2, c3), axis=0)

matplotlib.pyplot.scatter(cluster1_x1, cluster1_x2)
matplotlib.pyplot.scatter(cluster2_x1, cluster2_x2)
matplotlib.pyplot.scatter(cluster3_x1, cluster3_x2)
matplotlib.pyplot.title("Optimal Clustering")
matplotlib.pyplot.show()

def euclidean_distance(X, Y):
    """
    Calculate the euclidean distance between X and Y. It accepts:
    :X should be a matrix of size (N, f) where N is the number of samples and f is the number of features for each sample.
    :Y should be of size f. In other words, it is a single sample.

    Returns a vector of N elements with the distances between the N samples and Y.
    """

    return numpy.sqrt(numpy.sum(numpy.power(X - Y, 2), axis=1))

def cluster_data(solution, solution_idx):
    """
    Clusters the data based on the current solution.
    """

    global num_clusters, feature_vector_length, data
    cluster_centers = [] # A list of size (C, f) where C is the number of clusters and f is the number of features representing each sample.
    all_clusters_dists = [] # A list of size (C, N) where C is the number of clusters and N is the number of data samples. It holds the distances between each cluster center and all the data samples.
    clusters = [] # A list with C elements where each element holds the indices of the samples within a cluster.
    clusters_sum_dist = [] # A list with C elements where each element represents the sum of distances of the samples within a cluster.

    for clust_idx in range(num_clusters):
        # Extract the current cluster center from the solution.
        cluster_centers.append(solution[feature_vector_length*clust_idx:feature_vector_length*(clust_idx+1)])
        # Calculate the distance (e.g. euclidean) between the current cluster center and all samples.
        cluster_center_dists = euclidean_distance(data, cluster_centers[clust_idx])
        all_clusters_dists.append(numpy.array(cluster_center_dists))

    cluster_centers = numpy.array(cluster_centers)
    all_clusters_dists = numpy.array(all_clusters_dists)

    # A 1D array that, for each sample, holds the index of the cluster with the smallest distance.
    # In other words, the array holds the sample's cluster index.
    cluster_indices = numpy.argmin(all_clusters_dists, axis=0)
    for clust_idx in range(num_clusters):
        clusters.append(numpy.where(cluster_indices == clust_idx)[0])
        # Calculate the sum of distances for the cluster.
        if len(clusters[clust_idx]) == 0:
            # In case the cluster is empty (i.e. has zero samples).
            clusters_sum_dist.append(0)
        else:
            # When the cluster is not empty (i.e. has at least 1 sample).
            clusters_sum_dist.append(numpy.sum(all_clusters_dists[clust_idx, clusters[clust_idx]]))
            # clusters_sum_dist.append(numpy.sum(euclidean_distance(data[clusters[clust_idx], :], cluster_centers[clust_idx])))

    clusters_sum_dist = numpy.array(clusters_sum_dist)

    return cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist

def fitness_func(ga_instance, solution, solution_idx):
    _, _, _, _, clusters_sum_dist = cluster_data(solution, solution_idx)

    # The tiny value 0.00000001 is added to the denominator in case the sum of distances is 0.
    fitness = 1.0 / (numpy.sum(clusters_sum_dist) + 0.00000001)

    return fitness

num_clusters = 3
feature_vector_length = data.shape[1]
num_genes = num_clusters * feature_vector_length

ga_instance = pygad.GA(num_generations=100,
                       sol_per_pop=10,
                       init_range_low=0,
                       init_range_high=20,
                       num_parents_mating=5,
                       keep_parents=2,
                       num_genes=num_genes,
                       fitness_func=fitness_func,
                       suppress_warnings=True)

ga_instance.run()

best_solution, best_solution_fitness, best_solution_idx = ga_instance.best_solution()
print("Best solution is {bs}".format(bs=best_solution))
print("Fitness of the best solution is {bsf}".format(bsf=best_solution_fitness))
print("Best solution found after {gen} generations".format(gen=ga_instance.best_solution_generation))

cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist = cluster_data(best_solution, best_solution_idx)

for cluster_idx in range(num_clusters):
    cluster_x = data[clusters[cluster_idx], 0]
    cluster_y = data[clusters[cluster_idx], 1]
    matplotlib.pyplot.scatter(cluster_x, cluster_y)
    matplotlib.pyplot.scatter(cluster_centers[cluster_idx, 0], cluster_centers[cluster_idx, 1], linewidths=5)
matplotlib.pyplot.title("Clustering using PyGAD")
matplotlib.pyplot.show()

tests/example_custom_operators.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
import pygad
import numpy

"""
This script gives an example of using custom user-defined functions for the 3 operators:
1) Parent selection.
2) Crossover.
3) Mutation.
For more information, check the User-Defined Crossover, Mutation, and Parent Selection Operators section in the documentation:
https://pygad.readthedocs.io/en/latest/README_pygad_ReadTheDocs.html#user-defined-crossover-mutation-and-parent-selection-operators
"""

equation_inputs = [4, -2, 3.5]
desired_output = 44

def fitness_func(ga_instance, solution, solution_idx):
    output = numpy.sum(solution * equation_inputs)

    fitness = 1.0 / (numpy.abs(output - desired_output) + 0.000001)

    return fitness

def parent_selection_func(fitness, num_parents, ga_instance):
    # Selects the best {num_parents} parents. Works as steady-state selection.

    fitness_sorted = sorted(range(len(fitness)), key=lambda k: fitness[k])
    fitness_sorted.reverse()

    parents = numpy.empty((num_parents, ga_instance.population.shape[1]))

    for parent_num in range(num_parents):
        parents[parent_num, :] = ga_instance.population[fitness_sorted[parent_num], :].copy()

    return parents, numpy.array(fitness_sorted[:num_parents])

def crossover_func(parents, offspring_size, ga_instance):
    # This is single-point crossover.
    offspring = []
    idx = 0
    while len(offspring) != offspring_size[0]:
        parent1 = parents[idx % parents.shape[0], :].copy()
        parent2 = parents[(idx + 1) % parents.shape[0], :].copy()

        # Pick a random gene index as the split point (offspring_size[1] is the number of genes).
        random_split_point = numpy.random.choice(range(offspring_size[1]))

        parent1[random_split_point:] = parent2[random_split_point:]

        offspring.append(parent1)

        idx += 1

    return numpy.array(offspring)

def mutation_func(offspring, ga_instance):
    # This is random mutation that mutates a single gene per chromosome.
    for chromosome_idx in range(offspring.shape[0]):
        # Pick a single random gene and add a random value to it.
        random_gene_idx = numpy.random.choice(range(offspring.shape[1]))

        offspring[chromosome_idx, random_gene_idx] += numpy.random.random()

    return offspring

ga_instance = pygad.GA(num_generations=10,
                       sol_per_pop=5,
                       num_parents_mating=2,
                       num_genes=len(equation_inputs),
                       fitness_func=fitness_func,
                       parent_selection_type=parent_selection_func,
                       crossover_type=crossover_func,
                       mutation_type=mutation_func)

ga_instance.run()
ga_instance.plot_fitness()
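For comparison, the same run with PyGAD's built-in operators passes string names instead of callables; the values below are standard PyGAD options roughly equivalent in spirit to the custom functions above (a sketch, not part of the commit):

# Built-in equivalents of the custom operators (sketch).
ga_builtin = pygad.GA(num_generations=10,
                      sol_per_pop=5,
                      num_parents_mating=2,
                      num_genes=len(equation_inputs),
                      fitness_func=fitness_func,
                      parent_selection_type="sss",       # steady-state selection
                      crossover_type="single_point",
                      mutation_type="random")
ga_builtin.run()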

tests/example_logger.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
import logging
import pygad
import numpy

level = logging.DEBUG
name = 'logfile.txt'

logger = logging.getLogger(name)
logger.setLevel(level)

file_handler = logging.FileHandler(name, 'a+', 'utf-8')
file_handler.setLevel(logging.DEBUG)
file_format = logging.Formatter('%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
file_handler.setFormatter(file_format)
logger.addHandler(file_handler)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_format = logging.Formatter('%(message)s')
console_handler.setFormatter(console_format)
logger.addHandler(console_handler)

equation_inputs = [4, -2, 8]
desired_output = 2671.1234

def fitness_func(ga_instance, solution, solution_idx):
    output = numpy.sum(solution * equation_inputs)
    fitness = 1.0 / (numpy.abs(output - desired_output) + 0.000001)
    return fitness

def on_generation(ga_instance):
    ga_instance.logger.info("Generation = {generation}".format(generation=ga_instance.generations_completed))
    ga_instance.logger.info("Fitness = {fitness}".format(fitness=ga_instance.best_solution(pop_fitness=ga_instance.last_generation_fitness)[1]))

ga_instance = pygad.GA(num_generations=10,
                       sol_per_pop=40,
                       num_parents_mating=2,
                       keep_parents=2,
                       num_genes=len(equation_inputs),
                       fitness_func=fitness_func,
                       on_generation=on_generation,
                       logger=logger)
ga_instance.run()

logger.handlers.clear()
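The closing logger.handlers.clear() matters because logging.getLogger(name) returns the same logger object on every call, so re-running this setup in the same interpreter would otherwise attach duplicate handlers and repeat every log line. A small illustration of that behavior (hypothetical, not part of the commit):

# Hypothetical illustration: handlers accumulate on a named logger.
import logging

lg1 = logging.getLogger('demo')
lg1.addHandler(logging.StreamHandler())
lg2 = logging.getLogger('demo')          # same object as lg1
print(lg1 is lg2, len(lg2.handlers))     # True 1
lg2.handlers.clear()                     # avoid duplicates on the next setup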
