From 8a55454557cc9a0b41e51c25ef4857abf3db55ca Mon Sep 17 00:00:00 2001 From: Andrew-Chen-Wang Date: Thu, 9 Jul 2020 18:33:05 -0400 Subject: [PATCH] Support benchmarks for MacOS * Added how to run benchmarks in docs and README --- README.rst | 35 +++++- benchmark.py | 304 +++++++++++++++++++++++++++------------------ docs/benchmark.rst | 11 ++ 3 files changed, 224 insertions(+), 126 deletions(-) diff --git a/README.rst b/README.rst index 09cfb29..f355d45 100644 --- a/README.rst +++ b/README.rst @@ -22,22 +22,33 @@ Documentation: http://django-cachalot.readthedocs.io .. image:: https://img.shields.io/badge/cachalot-Chat%20on%20Slack-green?style=flat&logo=slack :target: https://join.slack.com/t/cachalotdjango/shared_invite/zt-dd0tj27b-cIH6VlaSOjAWnTG~II5~qw +---- + +Table of Contents: + +- Quickstart +- Usage +- Hacking +- Benchmark +- Third-Party Cache Comparison +- Discussion + Quickstart ---------- Cachalot officially supports Python 3.5-3.8 and Django 2.0-2.2, 3.0 with the databases PostgreSQL, SQLite, and MySQL. Usage -..... +----- #. ``pip install django-cachalot`` #. Add ``'cachalot',`` to your ``INSTALLED_APPS`` #. If you use multiple servers with a common cache server, - :ref:`double check their clock synchronisation `_ + `double check their clock synchronisation `_ #. If you modify data outside Django – typically after restoring a SQL database –, - use the :ref:`manage.py command `_ -#. Be aware of :ref:`the few other limits `_ + use the `manage.py command `_ +#. Be aware of `the few other limits `_ #. If you use `django-debug-toolbar `_, you can add ``'cachalot.panels.CachalotPanel',`` @@ -63,6 +74,22 @@ For setup: #. For PostgreSQL: ``CREATE ROLE cachalot LOGIN SUPERUSER;`` #. Run: ``tox --current-env`` to run the test suite on your current Python version. +Benchmark +--------- + +Currently, benchmarks are supported on Linux and Mac/Darwin. +You will need a database called "cachalot" on MySQL and PostgreSQL. 
+Additionally, on PostgreSQL, you will need to create a role +called "cachalot." You can also run the benchmark, and it'll raise +errors with specific instructions for how to fix it. + +#. Install: ``pip install -r requirements/benchmark.txt`` +#. Run: ``python benchmark.py`` + +The output will be in benchmark/TODAY'S_DATE/ + +TODO Create Docker-compose file to allow for easier running of data. + Third-Party Cache Comparison ---------------------------- diff --git a/benchmark.py b/benchmark.py index c240ee5..c7ebd57 100755 --- a/benchmark.py +++ b/benchmark.py @@ -1,48 +1,49 @@ -from collections import OrderedDict import io import os import platform -from random import choice import re import sqlite3 +from collections import OrderedDict +from datetime import datetime +from random import choice from subprocess import check_output from time import time - -os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings') - +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings") import django django.setup() -from django.conf import settings -from django.contrib.auth.models import User, Group -from django.core.cache import caches -from django.db import connections, connection -from django.test.utils import CaptureQueriesContext, override_settings -from django.utils.encoding import force_text import matplotlib.pyplot as plt -import _mysql import pandas as pd import psycopg2 +from django.conf import settings +from django.contrib.auth.models import Group, User +from django.core.cache import caches +from django.db import connection, connections +from django.test.utils import CaptureQueriesContext, override_settings +from django.utils.encoding import force_text +from MySQLdb import _mysql import cachalot from cachalot.api import invalidate from cachalot.tests.models import Test -RESULTS_PATH = 'benchmark/' -DATA_PATH = '/var/lib/' -CONTEXTS = ('Control', 'Cold cache', 'Hot cache') -DIVIDER = 'divider' +RESULTS_PATH = f"benchmark/docs/{datetime.now().date()}/" +CONTEXTS = 
("Control", "Cold cache", "Hot cache") +DIVIDER = "divider" + +LINUX_DATA_PATH = "/var/lib/" DISK_DATA_RE = re.compile(r'^MODEL="(.*)" MOUNTPOINT="(.*)"$') -def get_disk_model_for_path(path): - out = force_text(check_output(['lsblk', '-Po', 'MODEL,MOUNTPOINT'])) +def get_disk_model_for_path_linux(path): + out = force_text(check_output(["lsblk", "-Po", "MODEL,MOUNTPOINT"])) mount_points = [] previous_model = None - for model, mount_point in [DISK_DATA_RE.match(line).groups() - for line in out.split('\n') if line]: + for model, mount_point in [ + DISK_DATA_RE.match(line).groups() for line in out.split("\n") if line + ]: if model: previous_model = model.strip() if mount_point: @@ -55,65 +56,99 @@ def get_disk_model_for_path(path): def write_conditions(): versions = OrderedDict() + distribution = platform.uname() - # CPU - with open('/proc/cpuinfo') as f: - versions['CPU'] = re.search(r'^model name\s+: (.+)$', f.read(), - flags=re.MULTILINE).group(1) - # RAM - with open('/proc/meminfo') as f: - versions['RAM'] = re.search(r'^MemTotal:\s+(.+)$', f.read(), - flags=re.MULTILINE).group(1) - versions.update(( - ('Disk', get_disk_model_for_path(DATA_PATH)), - )) - # OS - linux_dist = ' '.join(platform.linux_distribution()).strip() - if linux_dist: - versions['Linux distribution'] = linux_dist + # Linux + if distribution.system == "Linux": + # CPU + with open("/proc/cpuinfo") as f: + versions["CPU"] = re.search( + r"^model name\s+: (.+)$", f.read(), flags=re.MULTILINE + ).group(1) + # RAM + with open("/proc/meminfo") as f: + versions["RAM"] = re.search( + r"^MemTotal:\s+(.+)$", f.read(), flags=re.MULTILINE + ).group(1) + # Disk Model + versions.update((("Disk", get_disk_model_for_path_linux(LINUX_DATA_PATH)),)) + # OS + versions["Linux distribution"] = f"{distribution.system} {distribution.release}" + # Darwin else: - versions['OS'] = platform.system() + ' ' + platform.release() + # CPU + versions["CPU"] = os.popen("sysctl -n machdep.cpu.brand_string").read().rstrip("\n") + # 
RAM + versions["RAM"] = os.popen("sysctl -n hw.memsize").read().rstrip("\n") + # Disk Model + versions["DISK"] = os.popen( + "diskutil info /dev/disk0 | grep 'Device / Media Name'" + ).read().split(":")[1].rstrip("\n").lstrip(" ") + # OS + versions["OS"] = f"{distribution.system} {distribution.release}" - versions.update(( - ('Python', platform.python_version()), - ('Django', django.__version__), - ('cachalot', cachalot.__version__), - ('sqlite', sqlite3.sqlite_version), - )) + versions.update( + ( + ("Python", platform.python_version()), + ("Django", django.__version__), + ("cachalot", cachalot.__version__), + ("sqlite", sqlite3.sqlite_version), + ) + ) # PostgreSQL - with connections['postgresql'].cursor() as cursor: - cursor.execute('SELECT version();') - versions['PostgreSQL'] = re.match(r'^PostgreSQL\s+(\S+)\s', - cursor.fetchone()[0]).group(1) + try: + with connections["postgresql"].cursor() as cursor: + cursor.execute("SELECT version();") + versions["PostgreSQL"] = re.match( + r"^PostgreSQL\s+(\S+)\s", cursor.fetchone()[0] + ).group(1) + except django.db.utils.OperationalError: + raise django.db.utils.OperationalError( + "You need a PostgreSQL DB called \"cachalot\" first. " + "Login with \"psql -U postgres -h localhost\" and run: " + "CREATE DATABASE cachalot;" + ) # MySQL - with connections['mysql'].cursor() as cursor: - cursor.execute('SELECT version();') - versions['MySQL'] = cursor.fetchone()[0].split('-')[0] + try: + with connections["mysql"].cursor() as cursor: + cursor.execute("SELECT version();") + versions["MySQL"] = cursor.fetchone()[0].split("-")[0] + except django.db.utils.OperationalError: + raise django.db.utils.OperationalError( + "You need a MySQL DB called \"cachalot\" first. 
" + "Login with \"mysql -u root\" and run: CREATE DATABASE cachalot;" + ) # Redis - out = force_text( - check_output(['redis-cli', 'INFO', 'server'])).replace('\r', '') - versions['Redis'] = re.search(r'^redis_version:([\d\.]+)$', out, - flags=re.MULTILINE).group(1) + out = force_text(check_output(["redis-cli", "INFO", "server"])).replace("\r", "") + versions["Redis"] = re.search( + r"^redis_version:([\d\.]+)$", out, flags=re.MULTILINE + ).group(1) # memcached - out = force_text(check_output(['memcached', '-h'])) - versions['memcached'] = re.match(r'^memcached ([\d\.]+)$', out, - flags=re.MULTILINE).group(1) + out = force_text(check_output(["memcached", "-h"])) + versions["memcached"] = re.match( + r"^memcached ([\d\.]+)$", out, flags=re.MULTILINE + ).group(1) - versions.update(( - ('psycopg2', psycopg2.__version__.split()[0]), - ('mysqlclient', _mysql.__version__), - )) + versions.update( + ( + ("psycopg2", psycopg2.__version__.split()[0]), + ("mysqlclient", _mysql.__version__), + ) + ) - with io.open(os.path.join('benchmark', 'conditions.rst'), 'w') as f: - f.write('In this benchmark, a small database is generated, ' - 'and each test is executed %s times ' - 'under the following conditions:\n\n' % Benchmark.n) + with io.open(os.path.join(RESULTS_PATH, "conditions.rst"), "w") as f: + f.write( + "In this benchmark, a small database is generated, " + "and each test is executed %s times " + "under the following conditions:\n\n" % Benchmark.n + ) + + def write_table_sep(char="="): + f.write((char * 20) + " " + (char * 50) + "\n") - def write_table_sep(char='='): - f.write((char * 20) + ' ' + (char * 50) + '\n') write_table_sep() for k, v in versions.items(): - f.write(k.ljust(20) + ' ' + v + '\n') + f.write(k.ljust(20) + " " + v + "\n") write_table_sep() @@ -131,8 +166,10 @@ class AssertNumQueries(CaptureQueriesContext): def __exit__(self, exc_type, exc_val, exc_tb): super(AssertNumQueries, self).__exit__(exc_type, exc_val, exc_tb) if len(self) != self.n: - print('The 
amount of queries should be %s, but %s were captured.' - % (self.n, len(self))) + print( + "The amount of queries should be %s, but %s were captured." + % (self.n, len(self)) + ) class Benchmark(object): @@ -150,11 +187,14 @@ class Benchmark(object): self.query_function(self.db_alias) end = time() self.data.append( - {'query': self.query_name, - 'time': end - start, - 'context': context, - 'db': self.db_vendor, - 'cache': self.cache_name}) + { + "query": self.query_name, + "time": end - start, + "context": context, + "db": self.db_vendor, + "cache": self.cache_name, + } + ) def benchmark(self, query_str, to_list=True, num_queries=1): # Clears the cache before a single benchmark to ensure the same @@ -162,10 +202,10 @@ class Benchmark(object): caches[settings.CACHALOT_CACHE].clear() self.query_name = query_str - query_str = 'Test.objects.using(using)' + query_str + query_str = "Test.objects.using(using)" + query_str if to_list: - query_str = 'list(%s)' % query_str - self.query_function = eval('lambda using: ' + query_str) + query_str = "list(%s)" % query_str + self.query_function = eval("lambda using: " + query_str) with override_settings(CACHALOT_ENABLED=False): self.bench_once(CONTEXTS[0], num_queries) @@ -175,26 +215,24 @@ class Benchmark(object): self.bench_once(CONTEXTS[2], 0) def execute_benchmark(self): - self.benchmark('.count()', to_list=False) - self.benchmark('.first()', to_list=False) - self.benchmark('[:10]') - self.benchmark('[5000:5010]') + self.benchmark(".count()", to_list=False) + self.benchmark(".first()", to_list=False) + self.benchmark("[:10]") + self.benchmark("[5000:5010]") self.benchmark(".filter(name__icontains='e')[0:10]") self.benchmark(".filter(name__icontains='e')[5000:5010]") self.benchmark(".order_by('owner')[0:10]") self.benchmark(".order_by('owner')[5000:5010]") self.benchmark(".select_related('owner')[0:10]") self.benchmark(".select_related('owner')[5000:5010]") - self.benchmark(".prefetch_related('owner__groups')[0:10]", - 
num_queries=3) - self.benchmark(".prefetch_related('owner__groups')[5000:5010]", - num_queries=3) + self.benchmark(".prefetch_related('owner__groups')[0:10]", num_queries=3) + self.benchmark(".prefetch_related('owner__groups')[5000:5010]", num_queries=3) def run(self): for db_alias in settings.DATABASES: self.db_alias = db_alias self.db_vendor = connections[self.db_alias].vendor - print('Benchmarking %s…' % self.db_vendor) + print("Benchmarking %s…" % self.db_vendor) for cache_alias in settings.CACHES: cache = caches[cache_alias] self.cache_name = cache.__class__.__name__[:-5].lower() @@ -204,87 +242,109 @@ class Benchmark(object): self.df = pd.DataFrame.from_records(self.data) if not os.path.exists(RESULTS_PATH): os.mkdir(RESULTS_PATH) - self.df.to_csv(os.path.join(RESULTS_PATH, 'data.csv')) + self.df.to_csv(os.path.join(RESULTS_PATH, "data.csv")) - self.xlim = (0, self.df['time'].max() * 1.01) - self.output('db') - self.output('cache') + self.xlim = (0, self.df["time"].max() * 1.01) + self.output("db") + self.output("cache") def output(self, param): - gp = self.df.groupby(['context', 'query', param])['time'] + gp = self.df.groupby(["context", "query", param])["time"] self.means = gp.mean().unstack().unstack().reindex(CONTEXTS) los = self.means - gp.min().unstack().unstack().reindex(CONTEXTS) ups = gp.max().unstack().unstack().reindex(CONTEXTS) - self.means self.errors = dict( - (key, dict( - (subkey, - [[los[key][subkey][context] for context in self.means.index], - [ups[key][subkey][context] for context in self.means.index]]) - for subkey in self.means.columns.levels[1])) - for key in self.means.columns.levels[0]) + ( + key, + dict( + ( + subkey, + [ + [los[key][subkey][context] for context in self.means.index], + [ups[key][subkey][context] for context in self.means.index], + ], + ) + for subkey in self.means.columns.levels[1] + ), + ) + for key in self.means.columns.levels[0] + ) self.get_perfs(param) self.plot_detail(param) - gp = self.df.groupby(['context', 
param])['time'] + gp = self.df.groupby(["context", param])["time"] self.means = gp.mean().unstack().reindex(CONTEXTS) los = self.means - gp.min().unstack().reindex(CONTEXTS) ups = gp.max().unstack().reindex(CONTEXTS) - self.means self.errors = [ - [[los[key][context] for context in self.means.index], - [ups[key][context] for context in self.means.index]] - for key in self.means] + [ + [los[key][context] for context in self.means.index], + [ups[key][context] for context in self.means.index], + ] + for key in self.means + ] self.plot_general(param) def get_perfs(self, param): - with io.open(os.path.join(RESULTS_PATH, param + '_results.rst'), - 'w') as f: + with io.open(os.path.join(RESULTS_PATH, param + "_results.rst"), "w") as f: for v in self.means.columns.levels[0]: g = self.means[v].mean(axis=1) - perf = ('%s is %.1f× slower then %.1f× faster' - % (v.ljust(10), g[CONTEXTS[1]] / g[CONTEXTS[0]], - g[CONTEXTS[0]] / g[CONTEXTS[2]])) + perf = "%s is %.1f× slower then %.1f× faster" % ( + v.ljust(10), + g[CONTEXTS[1]] / g[CONTEXTS[0]], + g[CONTEXTS[0]] / g[CONTEXTS[2]], + ) print(perf) - f.write('- %s\n' % perf) + f.write("- %s\n" % perf) def plot_detail(self, param): for v in self.means.columns.levels[0]: plt.figure() axes = self.means[v].plot( - kind='barh', xerr=self.errors[v], - xlim=self.xlim, figsize=(15, 15), subplots=True, layout=(6, 2), - sharey=True, legend=False) + kind="barh", + xerr=self.errors[v], + xlim=self.xlim, + figsize=(15, 15), + subplots=True, + layout=(6, 2), + sharey=True, + legend=False, + ) plt.gca().invert_yaxis() for row in axes: for ax in row: ax.xaxis.grid(True) - ax.set_ylabel('') - ax.set_xlabel('Time (s)') - plt.savefig(os.path.join(RESULTS_PATH, '%s_%s.svg' % (param, v))) + ax.set_ylabel("") + ax.set_xlabel("Time (s)") + plt.savefig(os.path.join(RESULTS_PATH, "%s_%s.svg" % (param, v))) def plot_general(self, param): plt.figure() - ax = self.means.plot(kind='barh', xerr=self.errors, xlim=self.xlim) + ax = self.means.plot(kind="barh", 
xerr=self.errors, xlim=self.xlim) ax.invert_yaxis() ax.xaxis.grid(True) - ax.set_ylabel('') - ax.set_xlabel('Time (s)') - plt.savefig(os.path.join(RESULTS_PATH, '%s.svg' % param)) + ax.set_ylabel("") + ax.set_xlabel("Time (s)") + plt.savefig(os.path.join(RESULTS_PATH, "%s.svg" % param)) def create_data(using): User.objects.using(using).bulk_create( - [User(username='user%d' % i) for i in range(50)]) + [User(username="user%d" % i) for i in range(50)] + ) Group.objects.using(using).bulk_create( - [Group(name='test%d' % i) for i in range(10)]) + [Group(name="test%d" % i) for i in range(10)] + ) groups = list(Group.objects.using(using)) for u in User.objects.using(using): u.groups.add(choice(groups), choice(groups)) users = list(User.objects.using(using)) Test.objects.using(using).bulk_create( - [Test(name='test%d' % i, owner=choice(users)) for i in range(10000)]) + [Test(name="test%d" % i, owner=choice(users)) for i in range(10000)] + ) -if __name__ == '__main__': +if __name__ == "__main__": if not os.path.exists(RESULTS_PATH): os.mkdir(RESULTS_PATH) @@ -293,7 +353,7 @@ if __name__ == '__main__': old_db_names = {} for alias in connections: conn = connections[alias] - old_db_names[alias] = conn.settings_dict['NAME'] + old_db_names[alias] = conn.settings_dict["NAME"] conn.creation.create_test_db(autoclobber=True) print("Populating %s…" % connections[alias].vendor) diff --git a/docs/benchmark.rst b/docs/benchmark.rst index a679587..6dc6ca0 100644 --- a/docs/benchmark.rst +++ b/docs/benchmark.rst @@ -18,6 +18,17 @@ use unoptimised queries. Of course, they often lack useful indexes But what you may not know is that **the ORM currently generates totally unoptimised queries** [#]_. +You can run the benchmarks yourself (officially supported on Linux +and Mac). You will need a database called "cachalot" on MySQL and PostgreSQL. +Additionally, on PostgreSQL, you will need to create a role +called "cachalot." 
You can also run the benchmark, and it'll raise
+errors with specific instructions for how to fix them.
+
+#. Install: ``pip install -r requirements/benchmark.txt``
+#. Run: ``python benchmark.py``
+
+The output will be in benchmark/docs/TODAY'S_DATE/
+
 Conditions
 ..........