From 8a55454557cc9a0b41e51c25ef4857abf3db55ca Mon Sep 17 00:00:00 2001 From: Andrew-Chen-Wang Date: Thu, 9 Jul 2020 18:33:05 -0400 Subject: [PATCH] Support benchmarks for MacOS * Added how to run benchmarks in docs and README --- README.rst | 35 +++++- benchmark.py | 304 +++++++++++++++++++++++++++------------------ docs/benchmark.rst | 11 ++ 3 files changed, 224 insertions(+), 126 deletions(-) diff --git a/README.rst b/README.rst index 09cfb29..f355d45 100644 --- a/README.rst +++ b/README.rst @@ -22,22 +22,33 @@ Documentation: http://django-cachalot.readthedocs.io .. image:: https://img.shields.io/badge/cachalot-Chat%20on%20Slack-green?style=flat&logo=slack :target: https://join.slack.com/t/cachalotdjango/shared_invite/zt-dd0tj27b-cIH6VlaSOjAWnTG~II5~qw +---- + +Table of Contents: + +- Quickstart +- Usage +- Hacking +- Benchmark +- Third-Party Cache Comparison +- Discussion + Quickstart ---------- Cachalot officially supports Python 3.5-3.8 and Django 2.0-2.2, 3.0 with the databases PostgreSQL, SQLite, and MySQL. Usage -..... +----- #. ``pip install django-cachalot`` #. Add ``'cachalot',`` to your ``INSTALLED_APPS`` #. If you use multiple servers with a common cache server, - :ref:`double check their clock synchronisation `_ + `double check their clock synchronisation `_ #. If you modify data outside Django – typically after restoring a SQL database –, - use the :ref:`manage.py command `_ -#. Be aware of :ref:`the few other limits `_ + use the `manage.py command `_ +#. Be aware of `the few other limits `_ #. If you use `django-debug-toolbar `_, you can add ``'cachalot.panels.CachalotPanel',`` @@ -63,6 +74,22 @@ For setup: #. For PostgreSQL: ``CREATE ROLE cachalot LOGIN SUPERUSER;`` #. Run: ``tox --current-env`` to run the test suite on your current Python version. +Benchmark +--------- + +Currently, benchmarks are supported on Linux and Mac/Darwin. +You will need a database called "cachalot" on MySQL and PostgreSQL. 
+Additionally, on PostgreSQL, you will need to create a role +called "cachalot." You can also run the benchmark, and it'll raise +errors with specific instructions for how to fix it. + +#. Install: ``pip install -r requirements/benchmark.txt`` +#. Run: ``python benchmark.py`` + +The output will be in benchmark/TODAY'S_DATE/ + +TODO Create Docker-compose file to allow for easier running of data. + Third-Party Cache Comparison ---------------------------- diff --git a/benchmark.py b/benchmark.py index c240ee5..c7ebd57 100755 --- a/benchmark.py +++ b/benchmark.py @@ -1,48 +1,49 @@ -from collections import OrderedDict import io import os import platform -from random import choice import re import sqlite3 +from collections import OrderedDict +from datetime import datetime +from random import choice from subprocess import check_output from time import time - -os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings') - +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings") import django django.setup() -from django.conf import settings -from django.contrib.auth.models import User, Group -from django.core.cache import caches -from django.db import connections, connection -from django.test.utils import CaptureQueriesContext, override_settings -from django.utils.encoding import force_text import matplotlib.pyplot as plt -import _mysql import pandas as pd import psycopg2 +from django.conf import settings +from django.contrib.auth.models import Group, User +from django.core.cache import caches +from django.db import connection, connections +from django.test.utils import CaptureQueriesContext, override_settings +from django.utils.encoding import force_text +from MySQLdb import _mysql import cachalot from cachalot.api import invalidate from cachalot.tests.models import Test -RESULTS_PATH = 'benchmark/' -DATA_PATH = '/var/lib/' -CONTEXTS = ('Control', 'Cold cache', 'Hot cache') -DIVIDER = 'divider' +RESULTS_PATH = f"benchmark/docs/{datetime.now().date()}/" +CONTEXTS = 
("Control", "Cold cache", "Hot cache") +DIVIDER = "divider" + +LINUX_DATA_PATH = "/var/lib/" DISK_DATA_RE = re.compile(r'^MODEL="(.*)" MOUNTPOINT="(.*)"$') -def get_disk_model_for_path(path): - out = force_text(check_output(['lsblk', '-Po', 'MODEL,MOUNTPOINT'])) +def get_disk_model_for_path_linux(path): + out = force_text(check_output(["lsblk", "-Po", "MODEL,MOUNTPOINT"])) mount_points = [] previous_model = None - for model, mount_point in [DISK_DATA_RE.match(line).groups() - for line in out.split('\n') if line]: + for model, mount_point in [ + DISK_DATA_RE.match(line).groups() for line in out.split("\n") if line + ]: if model: previous_model = model.strip() if mount_point: @@ -55,65 +56,99 @@ def get_disk_model_for_path(path): def write_conditions(): versions = OrderedDict() + distribution = platform.uname() - # CPU - with open('/proc/cpuinfo') as f: - versions['CPU'] = re.search(r'^model name\s+: (.+)$', f.read(), - flags=re.MULTILINE).group(1) - # RAM - with open('/proc/meminfo') as f: - versions['RAM'] = re.search(r'^MemTotal:\s+(.+)$', f.read(), - flags=re.MULTILINE).group(1) - versions.update(( - ('Disk', get_disk_model_for_path(DATA_PATH)), - )) - # OS - linux_dist = ' '.join(platform.linux_distribution()).strip() - if linux_dist: - versions['Linux distribution'] = linux_dist + # Linux + if distribution.system == "Linux": + # CPU + with open("/proc/cpuinfo") as f: + versions["CPU"] = re.search( + r"^model name\s+: (.+)$", f.read(), flags=re.MULTILINE + ).group(1) + # RAM + with open("/proc/meminfo") as f: + versions["RAM"] = re.search( + r"^MemTotal:\s+(.+)$", f.read(), flags=re.MULTILINE + ).group(1) + # Disk Model + versions.update((("Disk", get_disk_model_for_path_linux(LINUX_DATA_PATH)),)) + # OS + versions["Linux distribution"] = f"{distribution.system} {distribution.release}" + # Darwin else: - versions['OS'] = platform.system() + ' ' + platform.release() + # CPU + versions["CPU"] = os.popen("sysctl -n machdep.cpu.brand_string").read().rstrip("\n") + # 
RAM + versions["RAM"] = os.popen("sysctl -n hw.memsize").read().rstrip("\n") + # Disk Model + versions["DISK"] = os.popen( + "diskutil info /dev/disk0 | grep 'Device / Media Name'" + ).read().split(":")[1].rstrip("\n").lstrip(" ") + # OS + versions["OS"] = f"{distribution.system} {distribution.release}" - versions.update(( - ('Python', platform.python_version()), - ('Django', django.__version__), - ('cachalot', cachalot.__version__), - ('sqlite', sqlite3.sqlite_version), - )) + versions.update( + ( + ("Python", platform.python_version()), + ("Django", django.__version__), + ("cachalot", cachalot.__version__), + ("sqlite", sqlite3.sqlite_version), + ) + ) # PostgreSQL - with connections['postgresql'].cursor() as cursor: - cursor.execute('SELECT version();') - versions['PostgreSQL'] = re.match(r'^PostgreSQL\s+(\S+)\s', - cursor.fetchone()[0]).group(1) + try: + with connections["postgresql"].cursor() as cursor: + cursor.execute("SELECT version();") + versions["PostgreSQL"] = re.match( + r"^PostgreSQL\s+(\S+)\s", cursor.fetchone()[0] + ).group(1) + except django.db.utils.OperationalError: + raise django.db.utils.OperationalError( + "You need a PostgreSQL DB called \"cachalot\" first. " + "Login with \"psql -U postgres -h localhost\" and run: " + "CREATE DATABASE cachalot;" + ) # MySQL - with connections['mysql'].cursor() as cursor: - cursor.execute('SELECT version();') - versions['MySQL'] = cursor.fetchone()[0].split('-')[0] + try: + with connections["mysql"].cursor() as cursor: + cursor.execute("SELECT version();") + versions["MySQL"] = cursor.fetchone()[0].split("-")[0] + except django.db.utils.OperationalError: + raise django.db.utils.OperationalError( + "You need a MySQL DB called \"cachalot\" first. 
" + "Login with \"mysql -u root\" and run: CREATE DATABASE cachalot;" + ) # Redis - out = force_text( - check_output(['redis-cli', 'INFO', 'server'])).replace('\r', '') - versions['Redis'] = re.search(r'^redis_version:([\d\.]+)$', out, - flags=re.MULTILINE).group(1) + out = force_text(check_output(["redis-cli", "INFO", "server"])).replace("\r", "") + versions["Redis"] = re.search( + r"^redis_version:([\d\.]+)$", out, flags=re.MULTILINE + ).group(1) # memcached - out = force_text(check_output(['memcached', '-h'])) - versions['memcached'] = re.match(r'^memcached ([\d\.]+)$', out, - flags=re.MULTILINE).group(1) + out = force_text(check_output(["memcached", "-h"])) + versions["memcached"] = re.match( + r"^memcached ([\d\.]+)$", out, flags=re.MULTILINE + ).group(1) - versions.update(( - ('psycopg2', psycopg2.__version__.split()[0]), - ('mysqlclient', _mysql.__version__), - )) + versions.update( + ( + ("psycopg2", psycopg2.__version__.split()[0]), + ("mysqlclient", _mysql.__version__), + ) + ) - with io.open(os.path.join('benchmark', 'conditions.rst'), 'w') as f: - f.write('In this benchmark, a small database is generated, ' - 'and each test is executed %s times ' - 'under the following conditions:\n\n' % Benchmark.n) + with io.open(os.path.join(RESULTS_PATH, "conditions.rst"), "w") as f: + f.write( + "In this benchmark, a small database is generated, " + "and each test is executed %s times " + "under the following conditions:\n\n" % Benchmark.n + ) + + def write_table_sep(char="="): + f.write((char * 20) + " " + (char * 50) + "\n") - def write_table_sep(char='='): - f.write((char * 20) + ' ' + (char * 50) + '\n') write_table_sep() for k, v in versions.items(): - f.write(k.ljust(20) + ' ' + v + '\n') + f.write(k.ljust(20) + " " + v + "\n") write_table_sep() @@ -131,8 +166,10 @@ class AssertNumQueries(CaptureQueriesContext): def __exit__(self, exc_type, exc_val, exc_tb): super(AssertNumQueries, self).__exit__(exc_type, exc_val, exc_tb) if len(self) != self.n: - print('The 
amount of queries should be %s, but %s were captured.' - % (self.n, len(self))) + print( + "The amount of queries should be %s, but %s were captured." + % (self.n, len(self)) + ) class Benchmark(object): @@ -150,11 +187,14 @@ class Benchmark(object): self.query_function(self.db_alias) end = time() self.data.append( - {'query': self.query_name, - 'time': end - start, - 'context': context, - 'db': self.db_vendor, - 'cache': self.cache_name}) + { + "query": self.query_name, + "time": end - start, + "context": context, + "db": self.db_vendor, + "cache": self.cache_name, + } + ) def benchmark(self, query_str, to_list=True, num_queries=1): # Clears the cache before a single benchmark to ensure the same @@ -162,10 +202,10 @@ class Benchmark(object): caches[settings.CACHALOT_CACHE].clear() self.query_name = query_str - query_str = 'Test.objects.using(using)' + query_str + query_str = "Test.objects.using(using)" + query_str if to_list: - query_str = 'list(%s)' % query_str - self.query_function = eval('lambda using: ' + query_str) + query_str = "list(%s)" % query_str + self.query_function = eval("lambda using: " + query_str) with override_settings(CACHALOT_ENABLED=False): self.bench_once(CONTEXTS[0], num_queries) @@ -175,26 +215,24 @@ class Benchmark(object): self.bench_once(CONTEXTS[2], 0) def execute_benchmark(self): - self.benchmark('.count()', to_list=False) - self.benchmark('.first()', to_list=False) - self.benchmark('[:10]') - self.benchmark('[5000:5010]') + self.benchmark(".count()", to_list=False) + self.benchmark(".first()", to_list=False) + self.benchmark("[:10]") + self.benchmark("[5000:5010]") self.benchmark(".filter(name__icontains='e')[0:10]") self.benchmark(".filter(name__icontains='e')[5000:5010]") self.benchmark(".order_by('owner')[0:10]") self.benchmark(".order_by('owner')[5000:5010]") self.benchmark(".select_related('owner')[0:10]") self.benchmark(".select_related('owner')[5000:5010]") - self.benchmark(".prefetch_related('owner__groups')[0:10]", - 
num_queries=3) - self.benchmark(".prefetch_related('owner__groups')[5000:5010]", - num_queries=3) + self.benchmark(".prefetch_related('owner__groups')[0:10]", num_queries=3) + self.benchmark(".prefetch_related('owner__groups')[5000:5010]", num_queries=3) def run(self): for db_alias in settings.DATABASES: self.db_alias = db_alias self.db_vendor = connections[self.db_alias].vendor - print('Benchmarking %s…' % self.db_vendor) + print("Benchmarking %s…" % self.db_vendor) for cache_alias in settings.CACHES: cache = caches[cache_alias] self.cache_name = cache.__class__.__name__[:-5].lower() @@ -204,87 +242,109 @@ class Benchmark(object): self.df = pd.DataFrame.from_records(self.data) if not os.path.exists(RESULTS_PATH): os.mkdir(RESULTS_PATH) - self.df.to_csv(os.path.join(RESULTS_PATH, 'data.csv')) + self.df.to_csv(os.path.join(RESULTS_PATH, "data.csv")) - self.xlim = (0, self.df['time'].max() * 1.01) - self.output('db') - self.output('cache') + self.xlim = (0, self.df["time"].max() * 1.01) + self.output("db") + self.output("cache") def output(self, param): - gp = self.df.groupby(['context', 'query', param])['time'] + gp = self.df.groupby(["context", "query", param])["time"] self.means = gp.mean().unstack().unstack().reindex(CONTEXTS) los = self.means - gp.min().unstack().unstack().reindex(CONTEXTS) ups = gp.max().unstack().unstack().reindex(CONTEXTS) - self.means self.errors = dict( - (key, dict( - (subkey, - [[los[key][subkey][context] for context in self.means.index], - [ups[key][subkey][context] for context in self.means.index]]) - for subkey in self.means.columns.levels[1])) - for key in self.means.columns.levels[0]) + ( + key, + dict( + ( + subkey, + [ + [los[key][subkey][context] for context in self.means.index], + [ups[key][subkey][context] for context in self.means.index], + ], + ) + for subkey in self.means.columns.levels[1] + ), + ) + for key in self.means.columns.levels[0] + ) self.get_perfs(param) self.plot_detail(param) - gp = self.df.groupby(['context', 
param])['time'] + gp = self.df.groupby(["context", param])["time"] self.means = gp.mean().unstack().reindex(CONTEXTS) los = self.means - gp.min().unstack().reindex(CONTEXTS) ups = gp.max().unstack().reindex(CONTEXTS) - self.means self.errors = [ - [[los[key][context] for context in self.means.index], - [ups[key][context] for context in self.means.index]] - for key in self.means] + [ + [los[key][context] for context in self.means.index], + [ups[key][context] for context in self.means.index], + ] + for key in self.means + ] self.plot_general(param) def get_perfs(self, param): - with io.open(os.path.join(RESULTS_PATH, param + '_results.rst'), - 'w') as f: + with io.open(os.path.join(RESULTS_PATH, param + "_results.rst"), "w") as f: for v in self.means.columns.levels[0]: g = self.means[v].mean(axis=1) - perf = ('%s is %.1f× slower then %.1f× faster' - % (v.ljust(10), g[CONTEXTS[1]] / g[CONTEXTS[0]], - g[CONTEXTS[0]] / g[CONTEXTS[2]])) + perf = "%s is %.1f× slower then %.1f× faster" % ( + v.ljust(10), + g[CONTEXTS[1]] / g[CONTEXTS[0]], + g[CONTEXTS[0]] / g[CONTEXTS[2]], + ) print(perf) - f.write('- %s\n' % perf) + f.write("- %s\n" % perf) def plot_detail(self, param): for v in self.means.columns.levels[0]: plt.figure() axes = self.means[v].plot( - kind='barh', xerr=self.errors[v], - xlim=self.xlim, figsize=(15, 15), subplots=True, layout=(6, 2), - sharey=True, legend=False) + kind="barh", + xerr=self.errors[v], + xlim=self.xlim, + figsize=(15, 15), + subplots=True, + layout=(6, 2), + sharey=True, + legend=False, + ) plt.gca().invert_yaxis() for row in axes: for ax in row: ax.xaxis.grid(True) - ax.set_ylabel('') - ax.set_xlabel('Time (s)') - plt.savefig(os.path.join(RESULTS_PATH, '%s_%s.svg' % (param, v))) + ax.set_ylabel("") + ax.set_xlabel("Time (s)") + plt.savefig(os.path.join(RESULTS_PATH, "%s_%s.svg" % (param, v))) def plot_general(self, param): plt.figure() - ax = self.means.plot(kind='barh', xerr=self.errors, xlim=self.xlim) + ax = self.means.plot(kind="barh", 
xerr=self.errors, xlim=self.xlim) ax.invert_yaxis() ax.xaxis.grid(True) - ax.set_ylabel('') - ax.set_xlabel('Time (s)') - plt.savefig(os.path.join(RESULTS_PATH, '%s.svg' % param)) + ax.set_ylabel("") + ax.set_xlabel("Time (s)") + plt.savefig(os.path.join(RESULTS_PATH, "%s.svg" % param)) def create_data(using): User.objects.using(using).bulk_create( - [User(username='user%d' % i) for i in range(50)]) + [User(username="user%d" % i) for i in range(50)] + ) Group.objects.using(using).bulk_create( - [Group(name='test%d' % i) for i in range(10)]) + [Group(name="test%d" % i) for i in range(10)] + ) groups = list(Group.objects.using(using)) for u in User.objects.using(using): u.groups.add(choice(groups), choice(groups)) users = list(User.objects.using(using)) Test.objects.using(using).bulk_create( - [Test(name='test%d' % i, owner=choice(users)) for i in range(10000)]) + [Test(name="test%d" % i, owner=choice(users)) for i in range(10000)] + ) -if __name__ == '__main__': +if __name__ == "__main__": if not os.path.exists(RESULTS_PATH): os.mkdir(RESULTS_PATH) @@ -293,7 +353,7 @@ if __name__ == '__main__': old_db_names = {} for alias in connections: conn = connections[alias] - old_db_names[alias] = conn.settings_dict['NAME'] + old_db_names[alias] = conn.settings_dict["NAME"] conn.creation.create_test_db(autoclobber=True) print("Populating %s…" % connections[alias].vendor) diff --git a/docs/benchmark.rst b/docs/benchmark.rst index a679587..6dc6ca0 100644 --- a/docs/benchmark.rst +++ b/docs/benchmark.rst @@ -18,6 +18,17 @@ use unoptimised queries. Of course, they often lack useful indexes But what you may not know is that **the ORM currently generates totally unoptimised queries** [#]_. +You can run the benchmarks yourself (officially supported on Linux +and Mac). You will need a database called "cachalot" on MySQL and PostgreSQL. +Additionally, on PostgreSQL, you will need to create a role +called "cachalot." 
You can also run the benchmark, and it'll raise
+errors with specific instructions for how to fix them.
+
+#. Install: ``pip install -r requirements/benchmark.txt``
+#. Run: ``python benchmark.py``
+
+The output will be in benchmark/docs/TODAY'S_DATE/
+
 Conditions
 ..........