Benchmarking

Notebook to organise benchmarks of different implementations of the common-nearest-neighbour clustering and other cluster algorithms:

  • DBSCAN (sklearn.cluster.DBSCAN)

  • HDBSCAN ()

  • OPTICS (sklearn.cluster.OPTICS)

  • Density peaks ()

  • Jarvis-Patrick ()

  • Common-nearest-neighbours (cnnclustering.cnn)

  • Common-nearest-neighbours (sklearn_extra.cluster.CommonNNClustering)

Table of Contents

  • 1  Pre-requirements

  • 2  Version info

  • 3  Helper function definitions

    • 3.1  Plots

    • 3.2  Test data set generation

    • 3.3  Benchmark organisation

    • 3.4  Profiling

  • 4  Consistency check

    • 4.1  scikit-learn DBSCAN

    • 4.2  scikit-learn-extra CommonNNClustering

    • 4.3  cnnclustering CNN from points on-the-fly

    • 4.4  cnnclustering CNN from points bulk

    • 4.5  cnnclustering CNN from distances on-the-fly

    • 4.6  cnnclustering CNN from distances bulk

    • 4.7  scikit-learn OPTICS (DBSCAN)

    • 4.8  scikit-learn OPTICS (XI)

  • 5  Timings

    • 5.1  Blobs set

  • 6  Fit variants

    • 6.1  From neighbours

      • 6.1.1  From list of sets

        • 6.1.1.1  Baseline

        • 6.1.1.2  Stdlib index

        • 6.1.1.3  Stdlib cython

      • 6.1.2  From numpy.array

      • 6.1.3  Check in CNN class context

    • 6.2  From density graph

      • 6.2.1  From SparsegraphArray

Pre-requirements

[1]:
from collections import Counter
import importlib
from operator import itemgetter
import sys
import time

from IPython.core.display import display, HTML
import numpy as np

%matplotlib widget
import matplotlib as mpl
import matplotlib.pyplot as plt

import hdbscan
import sklearn
import sklearn_extra
from sklearn import cluster as skcluster
from sklearn_extra import cluster as skextracluster
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

import cnnclustering
from cnnclustering import _cfits  # Cythonised fit implementation
from cnnclustering import _fits   # Python fit implementation
from cnnclustering import cnn

# Jupyter extensions
%load_ext Cython
%load_ext line_profiler
%load_ext memory_profiler
[2]:
# Matplotlib configuration
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rc_file("../tutorial/matplotlibrc", use_default_template=False)
[3]:
# Jupyter notebook configuration
display(HTML("<style>.container { width:85% !important; }</style>"))

Version info

[4]:
# Report the interpreter version followed by the version of every
# benchmarked package (right-aligned names, one package per line)
print(f"{'Python':>20} :  ", *sys.version.splitlines())

modules = list({
    'scikit-learn': sklearn,
    'scikit-learn-extra': sklearn_extra,
    'cnnclustering': cnnclustering,
    'hdbscan': hdbscan,
}.items())

for alias, m in modules:
    # Not every package exposes `__version__`; fall back to a placeholder
    print(f"{alias:>20} :  ", getattr(m, "__version__", "no version info"))
              Python :   3.8.3 (default, May 15 2020, 15:24:35)  [GCC 8.3.0]
        scikit-learn :   0.24.dev0
  scikit-learn-extra :   0.1.0b2
       cnnclustering :   0.3.9
             hdbscan :   no version info

Helper function definitions

Plots

[5]:
# Default axis settings applied to every benchmark plot via `ax.set(**ax_props)`
ax_props = dict(
    xlabel=None,
    ylabel=None,
    xlim=(-2.5, 2.5),
    ylim=(-2.5, 2.5),
    xticks=(),
    yticks=(),
    aspect="equal",
)
[6]:
def plot_data(data, labels=None, ax=None, noise=0):
    """Take a data set and cluster labels to make a basic 2D dot plot

    Args:
        data: Numpy `numpy.ndarray` of shape (#points, 2) with x, y
            coordinates of points in 2D
        labels: Sequence or `numpy.ndarray` of shape (#points,)
            holding integer cluster label assignments for all points.
            Converted with `numpy.asarray`, so plain lists work as
            well.  If `None`, will plot the data set without point
            colouring by label.
        ax: Matplotlib axes instance to attach the plot to.  If `None`,
            all open figures are closed and a new figure is created.
        noise: Integer label used to mark point as noise (no cluster
            assignment; Usually 0 or -1).  Every label from
            `noise + 1` up to the largest label present is drawn with
            its own `ax.plot` call.
    """

    if ax is None:
        plt.close('all')
        width, height = mpl.rcParams["figure.figsize"]
        _, ax = plt.subplots(figsize=(width / 2, height))

    # Marker settings shared by all point groups
    marker_props = {
        "linestyle": "",
        "marker": "o",
        "markersize": 4,
        "markeredgecolor": "k",
        }

    if labels is None:
        ax.plot(
            *data.T,
            color="None",
            markerfacecolor="white",
            **marker_props
            )

    else:
        # A plain list compared to an int yields a single `False`, which
        # would silently select no points; compare element-wise instead
        labels = np.asarray(labels)

        ax.plot(
            *data[np.where(labels == noise)[0]].T,
            color="None",
            markerfacecolor="gray",
            **marker_props
            )

        # `max` of an empty array raises; without labels there are no
        # clusters to draw
        largest = int(labels.max()) if labels.size else noise

        for cluster_number in range(noise + 1, largest + 1):
            ax.plot(
                *data[np.where(labels == cluster_number)[0]].T,
                **marker_props
                )

    ax.set(**{
        "xticks": (),
        "yticks": (),
        "xticklabels": (),
        "yticklabels": (),
        "aspect": "equal"
        })

Test data set generation

Data set generation functions should be generally designed in such a way that they expect exactly one argument n and return a 2D data set of n sample points. A label attribute can be optionally added to the function object for identification.

[7]:
# Global seed for data set generation functions
np.random.seed(42)
[8]:
# circles
def gen_circles(n):
    """Generate `n` points with `datasets.make_circles` and standardise them

    Uses `factor=0.5`, `noise=0.05` and a fixed `random_state` of 10.
    The result is passed through `StandardScaler().fit_transform`.
    """
    points, _ = datasets.make_circles(
        n_samples=n, factor=.5, noise=.05, random_state=10
    )
    scaler = StandardScaler()
    return scaler.fit_transform(points)

gen_circles.label = "circles"

# blobs
def gen_blobs(n):
    """Generate `n` points with `datasets.make_blobs` and standardise them

    The blobs are centred at (-10, -10), (10, -10) and (10, 10) and drawn
    with a fixed `random_state` of 10.  The result is passed through
    `StandardScaler().fit_transform`.
    """
    centres = [[-10, -10], [10, -10], [10, 10]]
    points, _ = datasets.make_blobs(
        centers=centres, n_samples=n, random_state=10
    )
    scaler = StandardScaler()
    return scaler.fit_transform(points)

gen_blobs.label = "blobs"

# moons
def gen_moons(n):
    """Generate `n` points with `datasets.make_moons` and standardise them

    Uses `noise=0.05` and a fixed `random_state` of 10.  The result is
    passed through `StandardScaler().fit_transform`.
    """
    points, _ = datasets.make_moons(n_samples=n, noise=.05, random_state=10)
    scaler = StandardScaler()
    return scaler.fit_transform(points)

gen_moons.label = "moons"

def gen_no_structure(n):
    """Generate `n` uniformly random 2D points and standardise them

    Draws from the global NumPy random state (`np.random.rand`), so
    the result depends on the seed and on previous draws.  The result
    is passed through `StandardScaler().fit_transform`.
    """
    uniform_points = np.random.rand(n, 2)
    scaler = StandardScaler()
    return scaler.fit_transform(uniform_points)

gen_no_structure.label = "None"

def gen_aniso(n):
    """Generate `n` linearly transformed blob points and standardise them

    Points come from `datasets.make_blobs` (fixed `random_state` of
    170), are multiplied with a fixed 2x2 matrix and finally passed
    through `StandardScaler().fit_transform`.
    """
    blobs, _ = datasets.make_blobs(n_samples=n, random_state=170)

    # Fixed linear map applied to the blob coordinates
    stretch = [[0.6, -0.6], [-0.4, 0.8]]
    stretched = np.dot(blobs, stretch)

    scaler = StandardScaler()
    return scaler.fit_transform(stretched)

gen_aniso.label = "aniso"

def gen_varied(n):
    """Generate `n` blob points of varying spread and standardise them

    Points come from `datasets.make_blobs` with per-blob standard
    deviations of 1.0, 2.5 and 0.5 (fixed `random_state` of 170) and
    are passed through `StandardScaler().fit_transform`.
    """
    spreads = [1.0, 2.5, 0.5]
    points, _ = datasets.make_blobs(
        n_samples=n, cluster_std=spreads, random_state=170
    )
    scaler = StandardScaler()
    return scaler.fit_transform(points)

gen_varied.label = "varied"

Benchmark organisation

[23]:
class DS:
    """Benchmark class to represent a data set

    Initialise a `DS` instance with a data set generation function.

    Attributes:
       gen_fxn: The data set generation function used.
       n: Number of points requested from `gen_fxn`.
       points: A (2D) sample data set
       neighbourhoods: Neighbourhoods pre-computed by
          `prepare_neighbourhoods`, `None` before that.
       r: Radius used for the pre-computed neighbourhoods.
       dist: `None` on initialisation.  May be assigned by the user to
          attach pre-computed distances; only inspected by `__str__`.
       timeits: Mapping of run label to timing result.
    """

    def __init__(
            self, gen_fxn, n, gen_fxn_args=None, gen_fxn_kwargs=None):
        self.gen_fxn = gen_fxn
        self.n = n

        if gen_fxn_kwargs is None:
            gen_fxn_kwargs = {}
        if gen_fxn_args is None:
            gen_fxn_args = ()

        self.points = gen_fxn(n, *gen_fxn_args, **gen_fxn_kwargs)
        self.neighbourhoods = None  # Pre-computed neighbourhoods
        self.r = None               # Neighbourhood computation radius
        self.dist = None            # Pre-computed distances (read by __str__)
        self.timeits = {}           # Timing results

    def prepare_neighbourhoods(self, r, numpy=False):
        """Pre-compute neighbourhoods at a given radius `r`

        Uses `sklearn.neighbors.NearestNeighbors`.  The result is stored
        on the instance as `neighbourhoods` (together with `r`); nothing
        is returned.

        Args:
            r: Neighbour search radius.
            numpy: If `True`, stores a `numpy.ndarray` of shape
                (#points,) and `dtype=object` containing the neighbours
                of each point as `numpy.ndarray` of shape
                (#neighbours,).  If `False`, stores a `list` of `set`s
                instead.
        """

        neighbour_model = NearestNeighbors(radius=r).fit(self.points)
        neighbours = neighbour_model.radius_neighbors(
            self.points, return_distance=False
            )

        # Remove self-counting
        neighbours = [set(x) for x in neighbours]
        for c, s in enumerate(neighbours):
            s.discard(c)

        if numpy:
            # Neighbourhoods differ in length.  Fill an object array
            # explicitly: `np.array` on a ragged list is rejected by
            # recent NumPy versions and would yield a 2D array if all
            # neighbourhoods happened to have the same size.
            ragged = np.empty(len(neighbours), dtype=object)
            for c, s in enumerate(neighbours):
                ragged[c] = np.fromiter(s, dtype=np.intp, count=len(s))
            neighbours = ragged

        self.neighbourhoods = neighbours
        self.r = r

    def __str__(self):
        try:
            desc_gen = self.gen_fxn.label
        except AttributeError:
            desc_gen = self.gen_fxn.__name__

        # p: from points
        # d: from distances
        # n: from neighbours

        if self.neighbourhoods is not None:
            desc_from = f"n{self.r}"
        elif self.dist is not None:
            desc_from = "d"
        else:
            desc_from = "p"

        return f"{desc_gen}_{self.n}_{desc_from}"

    def ratios(self, base=None):
        """Show relative performance of runs based on `timeits` dict

        Args:
            base: Label of the run used as reference.  If `None`, the
                run with the smallest average is the reference.

        Returns:
            List of `(label, average / reference average)` tuples
            sorted by ascending ratio.  Empty if nothing was timed yet.
        """

        if not self.timeits:
            # `min` on an empty sequence would raise
            return []

        if base is not None:
            base = self.timeits[base].average
        else:
            base = min(x.average for x in self.timeits.values())

        return sorted([
            (k, v.average / base)
            for k, v in self.timeits.items()
            ], key=itemgetter(1))
[24]:
# Benchmark fixture 1
def prepare_neighbours(n, r, gen_fxn, numpy=False):
    """Provide pre-computed neighbourhoods

    Uses `sklearn.neighbors.NearestNeighbors`. Removes self-counting
    of points as neighbours of themselves.

    Args:
       n: Number of data points
       r: Radius
       gen_fxn: Function that accepts one parameter `n` and returns
           a data set with `n` points, for which neighbourhoods will
           be computed.
       numpy: If `True`, provide neighbourhoods as 1D `numpy.array`
           (`dtype=object`) of 1D `numpy.array`s.  If `False`, provide
           a `list` of `set`s.

    Returns:
       The neighbourhoods in the container type selected by `numpy`.
    """

    data = gen_fxn(n)
    neighbour_model = NearestNeighbors(radius=r).fit(data)
    neighbours = neighbour_model.radius_neighbors(data, return_distance=False)
    neighbours = [set(x) for x in neighbours]
    for c, s in enumerate(neighbours):
        # Remove self-counting
        s.discard(c)

    if numpy:
        # Neighbourhoods differ in length.  Fill an object array
        # explicitly: `np.array` on a ragged list is rejected by recent
        # NumPy versions and would yield a 2D array if all
        # neighbourhoods happened to have the same size.
        ragged = np.empty(len(neighbours), dtype=object)
        for c, s in enumerate(neighbours):
            ragged[c] = np.fromiter(s, dtype=np.intp, count=len(s))
        neighbours = ragged

    return neighbours

Profiling

[25]:
def profile_fxn(
        f, ds, report_dir, *args,
        t=True, l=True, label=None, validate=True, **kwargs):
    """Function profiling procedure

    Runs %lprun and %timeit line magic on a globally defined function
    `fxn`.  Function args and kwargs need to be defined globally as
    well.  This is necessary, because (at least lprun) line magic does
    not seem to work well with local variables, e.g. the following
    alternative did not work (raises `NameError`):

       def profile_fxn(fxn, *args, **kwargs):
           %lprun -f fxn fxn(*args, **kwargs)
           ...

    This function expects a :obj:`DS` object, providing a dataset and
    pre-calculated values if necessary.  Report details are deduced from
    this object.  Timings are saved to the object.

    Args:
       f (:obj:`func`): Function to profile.
       ds (:obj:`DS`): Data set object.
       report_dir (str): Output directory file path.
       *args: Arguments passed to `f`

    Keyword args:
       t (bool): If `True`, time function call with timeit line magic
       l (bool): If `True`, line profile function call with lpro line magic
       label (optional, str): Label to identify the run.  If `None`,
          `fxn.__name__` is used.
       validate (bool): If True, execute function call once to evaluate
          the result before the benchmark

       **kwargs: Keyword arguments passed to `f`.

    Returns:
       None
    """

    global fxn
    global fxn_args
    global fxn_kwargs

    fxn = f
    fxn_args = args
    fxn_kwargs = kwargs

    if validate:
        # Validate function result (experimental)
        result = fxn(*fxn_args, **fxn_kwargs)

        # Convert result if not labels array
        if isinstance(result, list) and isinstance(result[0], np.ndarray):
            # Convert result if from original implementation (baseline)
            result = baseline_to_labels(result)

        if result is not None:
            noise = 0
            frequencies = Counter(result)
            if 0 in frequencies:
                noise = frequencies.pop(0)

            largest = frequencies.most_common(1)[0][1] if frequencies else 0
            clusters = len(frequencies)

            print(f"Length of labels:    {len(result)}")
            print(f"Noise:               {noise}")
            print(f"Largest:             {largest}")
            print(f"Clusters:            {clusters}")

    # Profile
    if l:
        %lprun -T {report_dir}/{fxn.__name__}_{ds.__str__()}.lprun -f fxn fxn(*fxn_args, **fxn_kwargs)
    if t:
        o = %timeit -q -o fxn(*fxn_args, **fxn_kwargs)

        if label is None:
            label = fxn.__name__
        ds.timeits.update({label: o})
[26]:
def baseline_to_labels(result):
    """Turn a baseline result into a flat array of cluster labels

    Every `numpy.ndarray` entry of `result` is treated as one cluster
    holding the indices of its members; other entries are ignored.
    Clusters are numbered from 1 in order of appearance.

    Returns:
        Float `numpy.ndarray` of length `len(result)` where position
        `i` holds the cluster number of index `i` (0 if unassigned).
    """

    labels = np.zeros(len(result))
    clusters = [entry for entry in result if isinstance(entry, np.ndarray)]

    for number, members in enumerate(clusters, start=1):
        for index in members:
            labels[index] = number

    return labels
[53]:
def sub_ds_args(args, ds):
    """Replace `"DS_ATTR:<name>"` placeholders by attributes of `ds`

    Every string in `args` starting with `"DS_ATTR:"` is substituted
    by `getattr(ds, <name>)`, where `<name>` is the part after the
    last colon.  All other arguments are passed through unchanged.

    Returns:
        The substituted arguments as a `tuple`.
    """

    def resolve(value):
        if isinstance(value, str) and value.startswith("DS_ATTR:"):
            return getattr(ds, value.split(":")[-1])
        return value

    return tuple(resolve(value) for value in args)
[57]:
def time_runs(
        signatures, gen_fxn,
        timings=None, samples=None, v=True):
    """Time a set of function signatures on data sets of varying size

    For each sample size `n` in `samples`, a fresh `DS(gen_fxn, n)` is
    created and every signature is timed on it through `profile_fxn`
    (timing only; line profiling and validation are switched off).

    Args:
        signatures: Iterable of `(label, f, args, KWARGS)` tuples.
            `args` may contain `"DS_ATTR:<name>"` placeholders (see
            `sub_ds_args`).  `KWARGS` maps a sample size (or
            `"default"`) to the keyword arguments used for `f`.
        gen_fxn: Data set generation function handed to `DS`.
        timings: Mapping to collect the results in.  A new `dict` is
            used if `None`.
        samples: Sample sizes to run.  Nothing is run if `None`.
        v: If `True`, print the relative performance after each
            sample size.

    Returns:
        `timings`, with `timings[n]` set to the `timeits` of the data
        set of size `n`.
    """

    timings = {} if timings is None else timings
    samples = [] if samples is None else samples

    for n in samples:
        ds = DS(gen_fxn, n)

        for label, f, args, KWARGS in signatures:
            call_args = sub_ds_args(args, ds)
            # Size-specific keyword arguments win over the defaults
            call_kwargs = KWARGS.get(n, KWARGS.get("default", {}))

            profile_fxn(
                f, ds, "/dev/null",
                *call_args,
                l=False, label=label, validate=False,
                **call_kwargs
                )

        timings[n] = ds.timeits

        if not v:
            continue

        print("-" * 80)
        print(n)
        for name, ratio in ds.ratios():
            print(f"{name:>15}: {ratio:7.3f}")

    return timings

Consistency check

Ensure that every implementation delivers a consistent cluster result for test data sets.

[29]:
# Plot the test data sets
plt.close("all")
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

generation_fxns = [
    gen_circles, gen_moons, gen_varied,
    gen_aniso, gen_blobs, gen_no_structure,
    ]

for count, gen_fxn in enumerate(generation_fxns):
    # Plot
    plot_data(gen_fxn(5000), ax=Ax[count])
    Ax[count].set(**ax_props)

    try:
        name = gen_fxn.label
    except AttributeError:
        name = gen_fxn.__name__

    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0.05, top=0.9, wspace=0, hspace=0.2
    )

plt.show()

scikit-learn DBSCAN

[15]:
# Cluster the test data sets with scikit-learn DBSCAN and plot the labels
plt.close("all")
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

generation_fxns = [
    gen_circles, gen_moons, gen_varied,
    gen_aniso, gen_blobs, gen_no_structure,
    ]

fit_params = [
    {"eps": 0.2, "min_samples": 5},    # circles
    {"eps": 0.2, "min_samples": 5},    # moons
    {"eps": 0.14, "min_samples": 20},  # varied
    {"eps": 0.11, "min_samples": 20},  # aniso
    {"eps": 0.2, "min_samples": 5},    # blobs
    {"eps": 0.2, "min_samples": 5},    # no structure
    ]

for count, (gen_fxn, params) in enumerate(zip(generation_fxns, fit_params)):
    # Fit
    data = gen_fxn(5000)
    labels = skcluster.dbscan(data, **params)[1]

    # Plot
    plot_data(data, labels=labels, ax=Ax[count], noise=-1)
    Ax[count].set(**ax_props)

    try:
        name = gen_fxn.label
    except AttributeError:
        name = gen_fxn.__name__

    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0.05, top=0.9, wspace=0, hspace=0.2
    )

plt.show()

scikit-learn-extra CommonNNClustering

[18]:
# Cluster the test data sets with scikit-learn-extra commonnn and plot the labels
plt.close("all")
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

generation_fxns = [
    gen_circles, gen_moons, gen_varied,
    gen_aniso, gen_blobs, gen_no_structure,
    ]

fit_params = [
    {"eps": 0.2, "min_samples": 5},    # circles
    {"eps": 0.2, "min_samples": 5},    # moons
    {"eps": 0.18, "min_samples": 20},  # varied
    {"eps": 0.15, "min_samples": 10},  # aniso
    {"eps": 0.2, "min_samples": 5},    # blobs
    {"eps": 0.2, "min_samples": 5},    # no structure
    ]

for count, (gen_fxn, params) in enumerate(zip(generation_fxns, fit_params)):
    # Fit
    data = gen_fxn(5000)
    labels = skextracluster.commonnn(data, **params)

    # Plot
    plot_data(data, labels=labels, ax=Ax[count], noise=-1)
    Ax[count].set(**ax_props)

    try:
        name = gen_fxn.label
    except AttributeError:
        name = gen_fxn.__name__

    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0.05, top=0.9, wspace=0, hspace=0.2
    )

plt.show()

cnnclustering CNN from points on-the-fly

[19]:
# Cluster the test data sets with cnnclustering CNN (from points, policy="conservative") and plot the labels
plt.close("all")
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

generation_fxns = [
    gen_circles, gen_moons, gen_varied,
    gen_aniso, gen_blobs, gen_no_structure,
    ]

fit_params = [
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # circles
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # moons
    {"radius_cutoff": 0.18, "cnn_cutoff": 20, "member_cutoff": 8},  # varied
    {"radius_cutoff": 0.15, "cnn_cutoff": 10},                      # aniso
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # blobs
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # no structure
    ]

for count, (gen_fxn, params) in enumerate(zip(generation_fxns, fit_params)):
    # Fit
    data = gen_fxn(5000)
    clustering = cnn.CNN(data)
    clustering.fit(**params, rec=True, policy="conservative")

    # Plot
    plot_data(data, labels=clustering.labels, ax=Ax[count], noise=0)
    Ax[count].set(**ax_props)

    try:
        name = gen_fxn.label
    except AttributeError:
        name = gen_fxn.__name__

    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0.05, top=0.9, wspace=0, hspace=0.2
    )

plt.show()
Execution time for call of fit: 0 hours, 0 minutes, 1.4432 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.200     5         2         None      2         0.500     0.001
--------------------------------------------------------------------------------
Execution time for call of fit: 0 hours, 0 minutes, 1.5315 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.200     5         2         None      2         0.500     0.000
--------------------------------------------------------------------------------
Execution time for call of fit: 0 hours, 0 minutes, 2.2066 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.180     20        8         None      3         0.337     0.135
--------------------------------------------------------------------------------
Execution time for call of fit: 0 hours, 0 minutes, 1.5084 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.150     10        2         None      3         0.326     0.028
--------------------------------------------------------------------------------
Execution time for call of fit: 0 hours, 0 minutes, 3.5528 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.200     5         2         None      3         0.333     0.000
--------------------------------------------------------------------------------
Execution time for call of fit: 0 hours, 0 minutes, 1.3277 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.200     5         2         None      1         1.000     0.000
--------------------------------------------------------------------------------

cnnclustering CNN from points bulk

[20]:
# Cluster the test data sets with cnnclustering CNN (from points, policy="progressive") and plot the labels
plt.close("all")
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

generation_fxns = [
    gen_circles, gen_moons, gen_varied,
    gen_aniso, gen_blobs, gen_no_structure,
    ]

fit_params = [
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # circles
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # moons
    {"radius_cutoff": 0.18, "cnn_cutoff": 20, "member_cutoff": 8},  # varied
    {"radius_cutoff": 0.15, "cnn_cutoff": 10},                      # aniso
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # blobs
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # no structure
    ]

for count, (gen_fxn, params) in enumerate(zip(generation_fxns, fit_params)):
    # Fit
    data = gen_fxn(5000)
    clustering = cnn.CNN(data)
    clustering.fit(**params, rec=True, policy="progressive")

    # Plot
    plot_data(data, labels=clustering.labels, ax=Ax[count], noise=0)
    Ax[count].set(**ax_props)

    try:
        name = gen_fxn.label
    except AttributeError:
        name = gen_fxn.__name__

    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0.05, top=0.9, wspace=0, hspace=0.2
    )

plt.show()
Execution time for call of fit: 0 hours, 0 minutes, 0.1380 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.200     5         2         None      2         0.500     0.001
--------------------------------------------------------------------------------
Execution time for call of fit: 0 hours, 0 minutes, 0.1805 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.200     5         2         None      2         0.500     0.000
--------------------------------------------------------------------------------
Execution time for call of fit: 0 hours, 0 minutes, 0.4791 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.180     20        8         None      3         0.337     0.135
--------------------------------------------------------------------------------
Execution time for call of fit: 0 hours, 0 minutes, 0.2958 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.150     10        2         None      3         0.326     0.028
--------------------------------------------------------------------------------
Execution time for call of fit: 0 hours, 0 minutes, 1.1661 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.200     5         2         None      3         0.333     0.000
--------------------------------------------------------------------------------
Execution time for call of fit: 0 hours, 0 minutes, 0.1053 seconds
--------------------------------------------------------------------------------
#points   R         C         min       max       #clusters %largest  %noise
5000      0.200     5         2         None      1         1.000     0.000
--------------------------------------------------------------------------------

cnnclustering CNN from distances on-the-fly

[21]:
# Cluster the test data sets with cnnclustering CNN (after calc_dist, policy="conservative") and plot the labels
plt.close("all")
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

generation_fxns = [
    gen_circles, gen_moons, gen_varied,
    gen_aniso, gen_blobs, gen_no_structure,
    ]

fit_params = [
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # circles
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # moons
    {"radius_cutoff": 0.18, "cnn_cutoff": 20, "member_cutoff": 8},  # varied
    {"radius_cutoff": 0.15, "cnn_cutoff": 10},                      # aniso
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # blobs
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # no structure
    ]

for count, (gen_fxn, params) in enumerate(zip(generation_fxns, fit_params)):
    # Fit
    data = gen_fxn(5000)
    clustering = cnn.CNN(data)
    clustering.calc_dist()
    clustering.fit(**params, rec=False, policy="conservative")

    # Plot
    plot_data(data, labels=clustering.labels, ax=Ax[count], noise=0)
    Ax[count].set(**ax_props)

    try:
        name = gen_fxn.label
    except AttributeError:
        name = gen_fxn.__name__

    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0.05, top=0.9, wspace=0, hspace=0.2
    )

plt.show()

cnnclustering CNN from distances bulk

[22]:
# Cluster the test data sets with cnnclustering CNN (after calc_dist, policy="progressive") and plot the labels
plt.close("all")
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

generation_fxns = [
    gen_circles, gen_moons, gen_varied,
    gen_aniso, gen_blobs, gen_no_structure,
    ]

fit_params = [
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # circles
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # moons
    {"radius_cutoff": 0.18, "cnn_cutoff": 20, "member_cutoff": 8},  # varied
    {"radius_cutoff": 0.15, "cnn_cutoff": 10},                      # aniso
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # blobs
    {"radius_cutoff": 0.2, "cnn_cutoff": 5},                        # no structure
    ]

for count, (gen_fxn, params) in enumerate(zip(generation_fxns, fit_params)):
    # Fit
    data = gen_fxn(5000)
    clustering = cnn.CNN(data)
    clustering.calc_dist()
    clustering.fit(**params, rec=False, policy="progressive")

    # Plot
    plot_data(data, labels=clustering.labels, ax=Ax[count], noise=0)
    Ax[count].set(**ax_props)

    try:
        name = gen_fxn.label
    except AttributeError:
        name = gen_fxn.__name__

    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0.05, top=0.9, wspace=0, hspace=0.2
    )

plt.show()

scikit-learn OPTICS (DBSCAN)

[79]:
# Cluster the test data sets with scikit-learn OPTICS (cluster_method="dbscan") and plot the labels
plt.close("all")
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

generation_fxns = [
    gen_circles, gen_moons, gen_varied,
    gen_aniso, gen_blobs, gen_no_structure,
    ]

fit_params = [
    {"max_eps": 0.25, "min_samples": 5, "cluster_method": "dbscan"},    # circles
    {"max_eps": 0.25, "min_samples": 5, "cluster_method": "dbscan"},    # moons
    {"max_eps": 0.125, "min_samples": 20, "cluster_method": "dbscan"},   # varied
    {"max_eps": 0.15, "min_samples": 20, "cluster_method": "dbscan"},   # aniso
    {"max_eps": 1, "min_samples": 5, "cluster_method": "dbscan"},    # blobs
    {"max_eps": 1, "min_samples": 5, "cluster_method": "dbscan"},    # no structure
    ]

for count, (gen_fxn, params) in enumerate(zip(generation_fxns, fit_params)):
    # Fit
    data = gen_fxn(5000)
    clustering = skcluster.OPTICS(**params)
    clustering.fit(data)

    # Plot
    plot_data(data, labels=clustering.labels_, ax=Ax[count], noise=-1)
    Ax[count].set(**ax_props)

    try:
        name = gen_fxn.label
    except AttributeError:
        name = gen_fxn.__name__

    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0.05, top=0.9, wspace=0, hspace=0.2
    )

plt.show()

scikit-learn OPTICS (XI)

[88]:
# Cluster the test data sets with scikit-learn OPTICS (xi parameters) and plot the labels
plt.close("all")
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

generation_fxns = [
    gen_circles, gen_moons, gen_varied,
    gen_aniso, gen_blobs, gen_no_structure,
    ]

fit_params = [
    {"min_samples": 5, "xi": 0.05, "min_cluster_size": 0.4},    # circles
    {"min_samples": 5, "xi": 0.05, "min_cluster_size": 0.4},    # moons
    {"min_samples": 10, "xi": 0.01, "min_cluster_size": 0.2},   # varied
    {"min_samples": 20, "xi": 0.03, "min_cluster_size": 0.1},   # aniso
    {"min_samples": 5, "xi": 0.05, "min_cluster_size": 0.3},    # blobs
    {"min_samples": 20, "xi": 0.2, "min_cluster_size": 0.2},    # no structure
    ]

for count, (gen_fxn, params) in enumerate(zip(generation_fxns, fit_params)):
    # Fit
    data = gen_fxn(5000)
    clustering = skcluster.OPTICS(**params)
    clustering.fit(data)

    # Plot
    plot_data(data, labels=clustering.labels_, ax=Ax[count], noise=-1)
    Ax[count].set(**ax_props)

    try:
        name = gen_fxn.label
    except AttributeError:
        name = gen_fxn.__name__

    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0.05, top=0.9, wspace=0, hspace=0.2
    )

plt.show()

Timings

Blobs set

[30]:
benchmark_signatures = [
    ("DBSCAN", skcluster.dbscan, ("DS_ATTR:points", ), {"default": {"eps": 0.2, "min_samples": 5}}),
    ("CommonNN", skextracluster.commonnn, ("DS_ATTR:points", ), {"default": {"eps": 0.2, "min_samples": 5}}),
]
[64]:
# Accumulate timings over repeated executions of this cell.  On a fresh
# kernel TIMINGS does not exist yet, so start with an empty collection.
# To discard earlier results, run `TIMINGS = {}` before this cell.
TIMINGS = globals().get("TIMINGS", {})
TIMINGS.update(time_runs(benchmark_signatures, gen_blobs, samples=[20000]))
--------------------------------------------------------------------------------
20000
         DBSCAN:   1.000
       CommonNN:  16.444
[65]:
# Plot the average run time against the sample size, one line per label
fig, ax = plt.subplots()

lines = []
for l in ["DBSCAN", "CommonNN"]:
    # (sample size, average run time) pairs of all runs timed for this
    # label, ordered by sample size so that the line is drawn left to right
    series = sorted(
        (k, v[l].average)
        for k, v in TIMINGS.items()
        if v.get(l, None) is not None
    )
    if not series:
        # Nothing timed for this label; `zip(*[])` could not be unpacked
        continue

    x, y = zip(*series)
    lines.append(ax.plot(x, y))
plt.show()
[80]:

[80]:
[[200, 0.001104492306285725],
 [2000, 0.0010950690424282844],
 [5000, 0.0010891374409994958]]

Fit variants

Benchmarks for different approaches to the common-nearest-neighbours fit function.

[13]:
# Benchmark results will be saved under:
report_dir = "reports/T460"
# report_dir = "reports/qcw21"
# report_dir = "reports/qcm07"

From neighbours

Tests of fit functions taking pre-computed neighbourhoods as input. Two ways of setting up the benchmarks are provided:

  • Use the prepare_neighbours function to quickly generate the input data

  • Use the DS benchmark class to organise different runs on essentially the same data set

[30]:
ds = DS(gen_circles, 100)

From list of sets

[25]:
# Prepare neighbours as list of sets
ds.prepare_neighbourhoods(0.5)
print(ds)
circles_2000_n0.5
Baseline
[110]:
# Profile the original (baseline) implementation.
# `DS` stores the pre-computed neighbourhoods as `neighbourhoods`; it has
# no `neighbours` attribute.
# NOTE(review): `fits` is not bound by the visible import cell, which
# imports `_fits` and `_cfits` from cnnclustering -- confirm the module.
profile_fxn(
    fits.fit_from_neighbours_baseline,
    ds, report_dir,
    20, ds.neighbourhoods,  # function args
    label="baseline"
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2

*** Profile printout saved to text file 'reports/T460/fit_from_neighbours_baseline_circles_2000_n0.5.lprun'.
256 ms ± 5.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Stdlib index
[47]:
# Visual check: plot the points together with the labels returned by
# fits.fit_stdlib_from_neighbours_index
plt.close("all")  # Discard figures left open by earlier cells
fig, ax = plt.subplots()
plot_data(ds.points, labels=np.asarray(fits.fit_stdlib_from_neighbours_index(20, ds.neighbourhoods)), ax=ax, noise=0)
plt.show()
[45]:
# Implementation using only the standard library (index variant)
profile_fxn(
    fits.fit_stdlib_from_neighbours_index,
    ds, report_dir,
    20, ds.neighbourhoods,  # function args
    label="std_index"
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2

*** Profile printout saved to text file 'reports/T460/fit_stdlib_from_neighbours_index_circles_2000_n0.5.lprun'.
36.3 ms ± 88 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
[111]:
# Visual check: plot the points together with the labels returned by
# fits.fit_stdlib_from_neighbours_loop
plt.close("all")  # Discard figures left open by earlier cells
fig, ax = plt.subplots()
plot_data(ds.points, labels=np.asarray(fits.fit_stdlib_from_neighbours_loop(20, ds.neighbours)), ax=ax, noise=0)
plt.show()
[113]:
# Implementation using only the standard library (loop variant)
profile_fxn(
    fits.fit_stdlib_from_neighbours_loop,
    ds, report_dir,
    20, ds.neighbours,  # function args
    label="std_loop"
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2

*** Profile printout saved to text file 'reports/T460/fit_stdlib_from_neighbours_loop_circles_2000_n0.5.lprun'.
40 ms ± 511 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
[114]:
# Visual check: plot the points together with the labels returned by
# fits.fit_stdlib_from_neighbours_loop_membercheck
plt.close("all")  # Discard figures left open by earlier cells
fig, ax = plt.subplots()
plot_data(ds.points, labels=np.asarray(fits.fit_stdlib_from_neighbours_loop_membercheck(20, ds.neighbours)), ax=ax, noise=0)
plt.show()
[115]:
# Implementation using only the standard library (loop variant with
# membership check)
profile_fxn(
    fits.fit_stdlib_from_neighbours_loop_membercheck,
    ds, report_dir,
    20, ds.neighbours,  # function args
    label="std_loop_membercheck"
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2

*** Profile printout saved to text file 'reports/T460/fit_stdlib_from_neighbours_loop_membercheck_circles_2000_n0.5.lprun'.
41.8 ms ± 3.99 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Stdlib cython
[29]:
# Visual check: plot the points together with the labels returned by the
# Cythonised _cfits.fit_from_neighbours
plt.close("all")  # Discard figures left open by earlier cells
fig, ax = plt.subplots()
plot_data(ds.points, labels=np.asarray(_cfits.fit_from_neighbours(20, ds.neighbourhoods)), ax=ax, noise=0)
plt.show()
[31]:
# Cythonised implementation working on the list-of-sets neighbourhoods.
# Line profiling is switched off (l=False): for the compiled function
# %lprun has no line timings to report and aborts with
# "ValueError: max() arg is an empty sequence" while printing its stats,
# which also prevents the subsequent timing step from running.
profile_fxn(
    _cfits.fit_from_neighbours,
    ds, report_dir,
    20, ds.neighbourhoods,  # function args
    label="std_cython",
    l=False
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-31-14cc7f3c97a4> in <module>
      1 # Implementation using only standard library
----> 2 profile_fxn(
      3     _cfits.fit_from_neighbours,
      4     ds, report_dir,
      5     20, ds.neighbourhoods,  # function args

<ipython-input-22-1405b8a32e8f> in profile_fxn(f, ds, report_dir, t, l, m, label, *args, **kwargs)
     68     # Profile
     69     if l:
---> 70         get_ipython().run_line_magic('lprun', '-T {report_dir}/{fxn.__name__}_{ds.__str__()}.lprun -f fxn fxn(*fxn_args, **fxn_kwargs)')
     71     if t:
     72         o = get_ipython().run_line_magic('timeit', '-o fxn(*fxn_args, **fxn_kwargs)')

~/.local/share/virtualenvs/CNN-5gkgQAOT/lib/python3.8/site-packages/IPython/core/interactiveshell.py in run_line_magic(self, magic_name, line, _stack_depth)
   2315                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2316             with self.builtin_trap:
-> 2317                 result = fn(*args, **kwargs)
   2318             return result
   2319

<decorator-gen-129> in lprun(self, parameter_s)

~/.local/share/virtualenvs/CNN-5gkgQAOT/lib/python3.8/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
    185     # but it's overkill for just that one bit of state.
    186     def magic_deco(arg):
--> 187         call = lambda f, *a, **k: f(*a, **k)
    188
    189         if callable(arg):

~/.local/share/virtualenvs/CNN-5gkgQAOT/lib/python3.8/site-packages/line_profiler/line_profiler.py in lprun(self, parameter_s)
    374         # Trap text output.
    375         stdout_trap = StringIO()
--> 376         profile.print_stats(stdout_trap, output_unit=output_unit, stripzeros='s' in opts)
    377         output = stdout_trap.getvalue()
    378         output = output.rstrip()

~/.local/share/virtualenvs/CNN-5gkgQAOT/lib/python3.8/site-packages/line_profiler/line_profiler.py in print_stats(self, stream, output_unit, stripzeros)
    142         """
    143         lstats = self.get_stats()
--> 144         show_text(lstats.timings, lstats.unit, output_unit=output_unit, stream=stream, stripzeros=stripzeros)
    145
    146     def run(self, cmd):

~/.local/share/virtualenvs/CNN-5gkgQAOT/lib/python3.8/site-packages/line_profiler/line_profiler.py in show_text(stats, unit, output_unit, stream, stripzeros)
    263
    264     for (fn, lineno, name), timings in sorted(stats.items()):
--> 265         show_func(fn, lineno, name, stats[fn, lineno, name], unit,
    266             output_unit=output_unit, stream=stream, stripzeros=stripzeros)
    267

~/.local/share/virtualenvs/CNN-5gkgQAOT/lib/python3.8/site-packages/line_profiler/line_profiler.py in show_func(filename, start_lineno, func_name, timings, unit, output_unit, stream, stripzeros)
    227         stream.write("Continuing without the function's contents.\n")
    228         # Fake empty lines so we can see the timings, if not the code.
--> 229         nlines = max(linenos) - min(min(linenos), start_lineno) + 1
    230         sublines = [''] * nlines
    231     for lineno, nhits, time in timings:

ValueError: max() arg is an empty sequence
[32]:
# Time the Cythonised fit directly, bypassing profile_fxn and its %lprun step
%timeit _cfits.fit_from_neighbours(20, ds.neighbourhoods)
25.8 ms ± 541 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

From numpy.array

[136]:
# Re-import the snippets module so that edits to fits.py take effect
# without restarting the kernel
importlib.reload(fits)
[136]:
<module 'snippets.fits' from '/home/janjoswig/CNN/tests/benchmark/snippets/fits.py'>
[118]:
# Switch to neighbourhoods as numpy.array of numpy.arrays
# (0.5 is presumably the neighbour search radius -- confirm against
# DS.prepare_neighbours)
ds.prepare_neighbours(0.5, numpy=True)
[142]:
# Visual check of the baseline fit on the numpy neighbourhoods; its result
# is passed through baseline_to_labels before plotting
plt.close("all")  # Discard figures left open by earlier cells
fig, ax = plt.subplots()
plot_data(ds.points, labels=baseline_to_labels(fits.fit_from_neighbours_baseline(20, ds.neighbours)), ax=ax, noise=0)
plt.show()
[140]:
# Baseline: profile the original fit implementation on the numpy
# neighbourhoods; the result is registered under the label "baseline"
profile_fxn(
    fits.fit_from_neighbours_baseline,
    ds, report_dir,
    20, ds.neighbours,  # function args
    label="baseline"
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2

*** Profile printout saved to text file 'reports/T460/fit_from_neighbours_baseline_circles_2000_n0.5.lprun'.
368 ms ± 57.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[137]:
# Visual check: plot the points together with the labels returned by
# fits.fit_numpy_mix (passed on as returned, without np.asarray)
plt.close("all")  # Discard figures left open by earlier cells
fig, ax = plt.subplots()
plot_data(ds.points, labels=fits.fit_numpy_mix(20, ds.neighbours), ax=ax, noise=0)
plt.show()
[143]:
# Implementation using numpy (mixed variant).
# Uses its own label: "numpy_index" is the label of
# fits.fit_numpy_from_neighbours_index, and sharing it would make the two
# results indistinguishable in the per-label benchmark summary.
profile_fxn(
    fits.fit_numpy_mix,
    ds, report_dir,
    20, ds.neighbours,  # function args
    label="numpy_mix"
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2

*** Profile printout saved to text file 'reports/T460/fit_numpy_mix_circles_2000_n0.5.lprun'.
146 ms ± 1.58 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
[119]:
# Visual check: plot the points together with the labels returned by
# fits.fit_numpy_from_neighbours_index
plt.close("all")  # Discard figures left open by earlier cells
fig, ax = plt.subplots()
plot_data(ds.points, labels=np.asarray(fits.fit_numpy_from_neighbours_index(20, ds.neighbours)), ax=ax, noise=0)
plt.show()
[139]:
# Implementation using numpy (index variant)
profile_fxn(
    fits.fit_numpy_from_neighbours_index,
    ds, report_dir,
    20, ds.neighbours,  # function args
    label="numpy_index"
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2

*** Profile printout saved to text file 'reports/T460/fit_numpy_from_neighbours_index_circles_2000_n0.5.lprun'.
103 ms ± 19.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
[121]:
# Visual check: plot the points together with the labels returned by
# fits.fit_numpy_from_neighbours_loop
plt.close("all")  # Discard figures left open by earlier cells
fig, ax = plt.subplots()
plot_data(ds.points, labels=np.asarray(fits.fit_numpy_from_neighbours_loop(20, ds.neighbours)), ax=ax, noise=0)
plt.show()
[123]:
# Implementation using numpy (loop variant)
profile_fxn(
    fits.fit_numpy_from_neighbours_loop,
    ds, report_dir,
    20, ds.neighbours,  # function args
    label="numpy_loop"
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2

*** Profile printout saved to text file 'reports/T460/fit_numpy_from_neighbours_loop_circles_2000_n0.5.lprun'.
96.9 ms ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
[124]:
# Visual check: plot the points together with the labels returned by
# fits.fit_numpy_from_neighbours_filtermembers
plt.close("all")  # Discard figures left open by earlier cells
fig, ax = plt.subplots()
plot_data(ds.points, labels=np.asarray(fits.fit_numpy_from_neighbours_filtermembers(20, ds.neighbours)), ax=ax, noise=0)
plt.show()
[130]:
# Implementation using numpy (member-filtering variant)
profile_fxn(
    fits.fit_numpy_from_neighbours_filtermembers,
    ds, report_dir,
    20, ds.neighbours,  # function args
    label="numpy_filter"
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2

*** Profile printout saved to text file 'reports/T460/fit_numpy_from_neighbours_filtermembers_circles_2000_n0.5.lprun'.
43.8 ms ± 965 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
[132]:
# Re-import the snippets module so that edits to fits.py take effect
# without restarting the kernel
importlib.reload(fits)
[132]:
<module 'snippets.fits' from '/home/janjoswig/CNN/tests/benchmark/snippets/fits.py'>
[129]:
# Visual check: plot the points together with the labels returned by
# fits.fit_numpy_from_neighbours_membercheck
plt.close("all")  # Discard figures left open by earlier cells
fig, ax = plt.subplots()
plot_data(ds.points, labels=np.asarray(fits.fit_numpy_from_neighbours_membercheck(20, ds.neighbours)), ax=ax, noise=0)
plt.show()
[131]:
# Implementation using numpy (membership-check variant).
# Uses its own label: "numpy_filter" is the label of
# fits.fit_numpy_from_neighbours_filtermembers, and sharing it would make
# the two results indistinguishable in the per-label benchmark summary.
profile_fxn(
    fits.fit_numpy_from_neighbours_membercheck,
    ds, report_dir,
    20, ds.neighbours,  # function args
    label="numpy_membercheck"
    )
Length of labels:    2000
Noise:               0
Largest:             1000
Clusters:            2

*** Profile printout saved to text file 'reports/T460/fit_numpy_from_neighbours_membercheck_circles_2000_n0.5.lprun'.
46.7 ms ± 4.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
[73]:
# Implementation using cythonised numpy; line profiling is disabled via
# l=False.
# NOTE(review): the recorded run of this cell failed with
# "Buffer dtype mismatch, expected 'npy_intp' but got 'double'", i.e. the
# arrays in ds.neighbours presumably need an integer (intp) dtype before
# they can be passed to cfits.cfit_from_neighbours -- confirm and convert.
profile_fxn(
    cfits.cfit_from_neighbours,
    ds, report_dir,
    1, ds.neighbours,  # function args
    label="cython_numpy_loop",
    l=False
    )
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-73-d8c72077d3b4> in <module>
      1 # Implementation using cythonised numpy
----> 2 profile_fxn(
      3     cfits.cfit_from_neighbours,
      4     ds, report_dir,
      5     1, ds.neighbours,  # function args

<ipython-input-65-3b108bdce0ce> in profile_fxn(f, ds, report_dir, t, l, m, label, *args, **kwargs)
     44
     45     # Validate function result
---> 46     result = fxn(*fxn_args, **fxn_kwargs)
     47     if isinstance(result, list) and isinstance(result[0], np.ndarray):
     48         # Convert result if from original implementation

~/CNN/tests/benchmark/snippets/cfits.pyx in cfits.cfit_from_neighbours()

ValueError: Buffer dtype mismatch, expected 'npy_intp' but got 'double'
[34]:
# Print one line per entry yielded by ds.ratios(): the first element
# right-aligned in 15 characters, the second as a float with 3 decimals
# (presumably label and relative run time -- confirm against DS.ratios)
for x in ds.ratios():
    print(f"{x[0]:>15}: {x[1]:7.3f}")
        std_cnn:   1.000

Check in CNN class context

[16]:
# Profile the fit as a method of a CNN object that wraps the pre-computed
# neighbourhoods of the benchmark data set
# NOTE(review): the meaning of `rec=False` is not visible here; it is
# presumably forwarded to cobj.fit -- confirm against profile_fxn.
cobj = cnn.CNN(neighbourhoods=cnn.NeighbourhoodsList(ds.neighbourhoods, 0.5))
profile_fxn(
    cobj.fit,
    ds, report_dir,
    0.5, 20,  # function args
    label="std_cnn",
    rec=False
    )

*** Profile printout saved to text file 'reports/T460/fit_circles_2000_n0.5.lprun'.
31.7 ms ± 447 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
[17]:
# Display the labels stored on the CNN object
cobj.labels
[17]:
Labels([1, 2, 2, ..., 1, 2, 2])
[27]:
# Re-import the cfits module without restarting the kernel
# NOTE(review): reloading a compiled extension module usually does not pick
# up a rebuilt binary -- confirm this has the intended effect.
importlib.reload(cfits)
[27]:
<module 'cfits' from '/home/janjoswig/CNN/cfits.cpython-38-x86_64-linux-gnu.so'>

From density graph

From SparsegraphArray

[17]:
# Re-import the cnn and _cfits modules without restarting the kernel
importlib.reload(cnn)
importlib.reload(_cfits)
[17]:
<module 'core._cfits' from '/home/janjoswig/CNN/core/_cfits.cpython-38-x86_64-linux-gnu.so'>
[27]:
# Convert the list-of-sets neighbourhoods into the arguments of a
# SparsegraphArray and build the graph from them
# (20 is the same value passed as first argument to the fit functions
# above; presumably the similarity cutoff -- confirm)
Graph = cnn.SparsegraphArray(*_cfits.NeighbourhoodsList2SparsegraphArray(ds.neighbourhoods, 20))
[28]:
# Length of the graph's index array
Graph._indices.shape[0]
[28]:
2001
[32]:
%timeit labels = cnn.Labels(_cfits.bfs_SparsegraphArray(Graph, Graph._indices))
417 µs ± 9.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
[31]:
# Display `labels`; the name must already exist in the kernel namespace
labels
[31]:
Labels([1, 2, 2, ..., 1, 2, 2])