Support benchmarks for MacOS

* Added how to run benchmarks in docs and README
This commit is contained in:
Andrew-Chen-Wang 2020-07-09 18:33:05 -04:00
parent b80195fb00
commit 8a55454557
3 changed files with 224 additions and 126 deletions

View file

@ -22,22 +22,33 @@ Documentation: http://django-cachalot.readthedocs.io
.. image:: https://img.shields.io/badge/cachalot-Chat%20on%20Slack-green?style=flat&logo=slack
:target: https://join.slack.com/t/cachalotdjango/shared_invite/zt-dd0tj27b-cIH6VlaSOjAWnTG~II5~qw
----
Table of Contents:
- Quickstart
- Usage
- Hacking
- Benchmark
- Third-Party Cache Comparison
- Discussion
Quickstart
----------
Cachalot officially supports Python 3.5-3.8 and Django 2.0-2.2, 3.0 with the databases PostgreSQL, SQLite, and MySQL.
Usage
.....
-----
#. ``pip install django-cachalot``
#. Add ``'cachalot',`` to your ``INSTALLED_APPS``
#. If you use multiple servers with a common cache server,
:ref:`double check their clock synchronisation <https://django-cachalot.readthedocs.io/en/latest/limits.html#multiple-servers>`_
`double check their clock synchronisation <https://django-cachalot.readthedocs.io/en/latest/limits.html#multiple-servers>`_
#. If you modify data outside Django
(typically after restoring a SQL database),
use the :ref:`manage.py command <https://django-cachalot.readthedocs.io/en/latest/quickstart.html#command>`_
#. Be aware of :ref:`the few other limits <https://django-cachalot.readthedocs.io/en/latest/limits.html#limits>`_
use the `manage.py command <https://django-cachalot.readthedocs.io/en/latest/quickstart.html#command>`_
#. Be aware of `the few other limits <https://django-cachalot.readthedocs.io/en/latest/limits.html#limits>`_
#. If you use
`django-debug-toolbar <https://github.com/jazzband/django-debug-toolbar>`_,
you can add ``'cachalot.panels.CachalotPanel',``
@ -63,6 +74,22 @@ For setup:
#. For PostgreSQL: ``CREATE ROLE cachalot LOGIN SUPERUSER;``
#. Run: ``tox --current-env`` to run the test suite on your current Python version.
Benchmark
---------
Currently, benchmarks are supported on Linux and Mac/Darwin.
You will need a database called "cachalot" on MySQL and PostgreSQL.
Additionally, on PostgreSQL, you will need to create a role
called "cachalot." You can also run the benchmark, and it'll raise
errors with specific instructions for how to fix them.
#. Install: ``pip install -r requirements/benchmark.txt``
#. Run: ``python benchmark.py``
The output will be in benchmark/docs/TODAY'S_DATE/
TODO: Create a Docker Compose file to make running the benchmarks easier.
Third-Party Cache Comparison
----------------------------

View file

@ -1,48 +1,49 @@
from collections import OrderedDict
import io
import os
import platform
from random import choice
import re
import sqlite3
from collections import OrderedDict
from datetime import datetime
from random import choice
from subprocess import check_output
from time import time
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
import django
django.setup()
from django.conf import settings
from django.contrib.auth.models import User, Group
from django.core.cache import caches
from django.db import connections, connection
from django.test.utils import CaptureQueriesContext, override_settings
from django.utils.encoding import force_text
import matplotlib.pyplot as plt
import _mysql
import pandas as pd
import psycopg2
from django.conf import settings
from django.contrib.auth.models import Group, User
from django.core.cache import caches
from django.db import connection, connections
from django.test.utils import CaptureQueriesContext, override_settings
from django.utils.encoding import force_text
from MySQLdb import _mysql
import cachalot
from cachalot.api import invalidate
from cachalot.tests.models import Test
# Benchmark artifacts (CSV data, SVG plots, rst tables) are written here,
# namespaced by today's date so successive runs don't overwrite each other.
RESULTS_PATH = f"benchmark/docs/{datetime.now().date()}/"
# Measurement contexts: cachalot disabled, first (cache-miss) run, cached run.
CONTEXTS = ("Control", "Cold cache", "Hot cache")
DIVIDER = "divider"
# Path whose backing disk model is reported on Linux (via lsblk below).
LINUX_DATA_PATH = "/var/lib/"

# Parses `lsblk -Po MODEL,MOUNTPOINT` output: one device per line,
# e.g. MODEL="Samsung SSD" MOUNTPOINT="/var/lib".
DISK_DATA_RE = re.compile(r'^MODEL="(.*)" MOUNTPOINT="(.*)"$')
def get_disk_model_for_path(path):
out = force_text(check_output(['lsblk', '-Po', 'MODEL,MOUNTPOINT']))
def get_disk_model_for_path_linux(path):
out = force_text(check_output(["lsblk", "-Po", "MODEL,MOUNTPOINT"]))
mount_points = []
previous_model = None
for model, mount_point in [DISK_DATA_RE.match(line).groups()
for line in out.split('\n') if line]:
for model, mount_point in [
DISK_DATA_RE.match(line).groups() for line in out.split("\n") if line
]:
if model:
previous_model = model.strip()
if mount_point:
@ -55,65 +56,99 @@ def get_disk_model_for_path(path):
def write_conditions():
    """Collect the benchmark environment (hardware, OS, interpreter, library
    and database server versions) and write it as an rst table to
    RESULTS_PATH/conditions.rst.

    Raises django.db.utils.OperationalError with setup instructions when the
    "cachalot" PostgreSQL or MySQL database is missing.
    """
    versions = OrderedDict()
    distribution = platform.uname()
    # Linux: read hardware info from procfs and lsblk.
    if distribution.system == "Linux":
        # CPU
        with open("/proc/cpuinfo") as f:
            versions["CPU"] = re.search(
                r"^model name\s+: (.+)$", f.read(), flags=re.MULTILINE
            ).group(1)
        # RAM
        with open("/proc/meminfo") as f:
            versions["RAM"] = re.search(
                r"^MemTotal:\s+(.+)$", f.read(), flags=re.MULTILINE
            ).group(1)
        # Disk Model
        versions.update((("Disk", get_disk_model_for_path_linux(LINUX_DATA_PATH)),))
        # OS
        versions["Linux distribution"] = f"{distribution.system} {distribution.release}"
    # Darwin: query the same facts through sysctl/diskutil.
    else:
        # CPU
        versions["CPU"] = os.popen("sysctl -n machdep.cpu.brand_string").read().rstrip("\n")
        # RAM (reported in bytes by hw.memsize)
        versions["RAM"] = os.popen("sysctl -n hw.memsize").read().rstrip("\n")
        # Disk Model
        versions["DISK"] = os.popen(
            "diskutil info /dev/disk0 | grep 'Device / Media Name'"
        ).read().split(":")[1].rstrip("\n").lstrip(" ")
        # OS
        versions["OS"] = f"{distribution.system} {distribution.release}"
    versions.update(
        (
            ("Python", platform.python_version()),
            ("Django", django.__version__),
            ("cachalot", cachalot.__version__),
            ("sqlite", sqlite3.sqlite_version),
        )
    )
    # PostgreSQL
    try:
        with connections["postgresql"].cursor() as cursor:
            cursor.execute("SELECT version();")
            versions["PostgreSQL"] = re.match(
                r"^PostgreSQL\s+(\S+)\s", cursor.fetchone()[0]
            ).group(1)
    except django.db.utils.OperationalError:
        raise django.db.utils.OperationalError(
            "You need a PostgreSQL DB called \"cachalot\" first. "
            "Login with \"psql -U postgres -h localhost\" and run: "
            "CREATE DATABASE cachalot;"
        )
    # MySQL
    try:
        with connections["mysql"].cursor() as cursor:
            cursor.execute("SELECT version();")
            versions["MySQL"] = cursor.fetchone()[0].split("-")[0]
    except django.db.utils.OperationalError:
        raise django.db.utils.OperationalError(
            "You need a MySQL DB called \"cachalot\" first. "
            "Login with \"mysql -u root\" and run: CREATE DATABASE cachalot;"
        )
    # Redis
    out = force_text(check_output(["redis-cli", "INFO", "server"])).replace("\r", "")
    versions["Redis"] = re.search(
        r"^redis_version:([\d\.]+)$", out, flags=re.MULTILINE
    ).group(1)
    # memcached
    out = force_text(check_output(["memcached", "-h"]))
    versions["memcached"] = re.match(
        r"^memcached ([\d\.]+)$", out, flags=re.MULTILINE
    ).group(1)
    versions.update(
        (
            ("psycopg2", psycopg2.__version__.split()[0]),
            ("mysqlclient", _mysql.__version__),
        )
    )
    with io.open(os.path.join(RESULTS_PATH, "conditions.rst"), "w") as f:
        f.write(
            "In this benchmark, a small database is generated, "
            "and each test is executed %s times "
            "under the following conditions:\n\n" % Benchmark.n
        )

        # Closure over `f`: writes an rst table separator row.
        def write_table_sep(char="="):
            f.write((char * 20) + " " + (char * 50) + "\n")

        write_table_sep()
        for k, v in versions.items():
            f.write(k.ljust(20) + " " + v + "\n")
        write_table_sep()
@ -131,8 +166,10 @@ class AssertNumQueries(CaptureQueriesContext):
def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the query-capture context and report an unexpected query count.

    Only prints a warning instead of raising, so a mismatch does not abort
    the whole benchmark run.
    """
    super(AssertNumQueries, self).__exit__(exc_type, exc_val, exc_tb)
    if len(self) != self.n:
        print(
            "The amount of queries should be %s, but %s were captured."
            % (self.n, len(self))
        )
class Benchmark(object):
@ -150,11 +187,14 @@ class Benchmark(object):
self.query_function(self.db_alias)
end = time()
self.data.append(
{'query': self.query_name,
'time': end - start,
'context': context,
'db': self.db_vendor,
'cache': self.cache_name})
{
"query": self.query_name,
"time": end - start,
"context": context,
"db": self.db_vendor,
"cache": self.cache_name,
}
)
def benchmark(self, query_str, to_list=True, num_queries=1):
# Clears the cache before a single benchmark to ensure the same
@ -162,10 +202,10 @@ class Benchmark(object):
caches[settings.CACHALOT_CACHE].clear()
self.query_name = query_str
query_str = 'Test.objects.using(using)' + query_str
query_str = "Test.objects.using(using)" + query_str
if to_list:
query_str = 'list(%s)' % query_str
self.query_function = eval('lambda using: ' + query_str)
query_str = "list(%s)" % query_str
self.query_function = eval("lambda using: " + query_str)
with override_settings(CACHALOT_ENABLED=False):
self.bench_once(CONTEXTS[0], num_queries)
@ -175,26 +215,24 @@ class Benchmark(object):
self.bench_once(CONTEXTS[2], 0)
def execute_benchmark(self):
    """Run the fixed suite of representative ORM queries through
    self.benchmark: aggregates, slices (start and middle of the table),
    filters, ordering, select_related and prefetch_related.

    prefetch_related legitimately issues 3 queries (main + one per
    prefetched relation level), hence num_queries=3.
    """
    self.benchmark(".count()", to_list=False)
    self.benchmark(".first()", to_list=False)
    self.benchmark("[:10]")
    self.benchmark("[5000:5010]")
    self.benchmark(".filter(name__icontains='e')[0:10]")
    self.benchmark(".filter(name__icontains='e')[5000:5010]")
    self.benchmark(".order_by('owner')[0:10]")
    self.benchmark(".order_by('owner')[5000:5010]")
    self.benchmark(".select_related('owner')[0:10]")
    self.benchmark(".select_related('owner')[5000:5010]")
    self.benchmark(".prefetch_related('owner__groups')[0:10]", num_queries=3)
    self.benchmark(".prefetch_related('owner__groups')[5000:5010]", num_queries=3)
def run(self):
for db_alias in settings.DATABASES:
self.db_alias = db_alias
self.db_vendor = connections[self.db_alias].vendor
print('Benchmarking %s' % self.db_vendor)
print("Benchmarking %s" % self.db_vendor)
for cache_alias in settings.CACHES:
cache = caches[cache_alias]
self.cache_name = cache.__class__.__name__[:-5].lower()
@ -204,87 +242,109 @@ class Benchmark(object):
self.df = pd.DataFrame.from_records(self.data)
if not os.path.exists(RESULTS_PATH):
os.mkdir(RESULTS_PATH)
self.df.to_csv(os.path.join(RESULTS_PATH, 'data.csv'))
self.df.to_csv(os.path.join(RESULTS_PATH, "data.csv"))
self.xlim = (0, self.df['time'].max() * 1.01)
self.output('db')
self.output('cache')
self.xlim = (0, self.df["time"].max() * 1.01)
self.output("db")
self.output("cache")
def output(self, param):
    """Aggregate timings grouped by `param` ('db' or 'cache') and emit the
    perf summary, the per-backend detail plots, and the general plot.

    Sets self.means / self.errors twice: first per (context, query, param)
    for the detail output, then per (context, param) for the general plot.
    Error bars are (mean - min, max - mean) per context.
    """
    # Per-(context, query, param) statistics -> perf summary + detail plots.
    gp = self.df.groupby(["context", "query", param])["time"]
    self.means = gp.mean().unstack().unstack().reindex(CONTEXTS)
    los = self.means - gp.min().unstack().unstack().reindex(CONTEXTS)
    ups = gp.max().unstack().unstack().reindex(CONTEXTS) - self.means
    self.errors = dict(
        (
            key,
            dict(
                (
                    subkey,
                    [
                        [los[key][subkey][context] for context in self.means.index],
                        [ups[key][subkey][context] for context in self.means.index],
                    ],
                )
                for subkey in self.means.columns.levels[1]
            ),
        )
        for key in self.means.columns.levels[0]
    )
    self.get_perfs(param)
    self.plot_detail(param)
    # Coarser per-(context, param) statistics -> general plot.
    gp = self.df.groupby(["context", param])["time"]
    self.means = gp.mean().unstack().reindex(CONTEXTS)
    los = self.means - gp.min().unstack().reindex(CONTEXTS)
    ups = gp.max().unstack().reindex(CONTEXTS) - self.means
    self.errors = [
        [
            [los[key][context] for context in self.means.index],
            [ups[key][context] for context in self.means.index],
        ]
        for key in self.means
    ]
    self.plot_general(param)
def get_perfs(self, param):
    """Write one summary line per backend to RESULTS_PATH/<param>_results.rst.

    The line reads "<backend> is X× slower then Y× faster": X = cold-cache
    mean over control mean (slowdown on miss), Y = control mean over
    hot-cache mean (speedup on hit).
    """
    with io.open(os.path.join(RESULTS_PATH, param + "_results.rst"), "w") as f:
        for v in self.means.columns.levels[0]:
            # Mean time per context, averaged across all queries.
            g = self.means[v].mean(axis=1)
            perf = "%s is %.1f× slower then %.1f× faster" % (
                v.ljust(10),
                g[CONTEXTS[1]] / g[CONTEXTS[0]],
                g[CONTEXTS[0]] / g[CONTEXTS[2]],
            )
            print(perf)
            f.write("- %s\n" % perf)
def plot_detail(self, param):
    """Save one SVG per backend with a 6x2 grid of horizontal bar subplots,
    one subplot per query, sharing the x (time) axis.
    """
    for v in self.means.columns.levels[0]:
        plt.figure()
        axes = self.means[v].plot(
            kind="barh",
            xerr=self.errors[v],
            xlim=self.xlim,
            figsize=(15, 15),
            subplots=True,
            layout=(6, 2),
            sharey=True,
            legend=False,
        )
        # barh plots bottom-up by default; flip so contexts read top-down.
        plt.gca().invert_yaxis()
        for row in axes:
            for ax in row:
                ax.xaxis.grid(True)
                ax.set_ylabel("")
                ax.set_xlabel("Time (s)")
        plt.savefig(os.path.join(RESULTS_PATH, "%s_%s.svg" % (param, v)))
def plot_general(self, param):
    """Save the aggregated horizontal bar chart (all backends on one axis)
    to RESULTS_PATH/<param>.svg.
    """
    plt.figure()
    ax = self.means.plot(kind="barh", xerr=self.errors, xlim=self.xlim)
    # barh plots bottom-up by default; flip so contexts read top-down.
    ax.invert_yaxis()
    ax.xaxis.grid(True)
    ax.set_ylabel("")
    ax.set_xlabel("Time (s)")
    plt.savefig(os.path.join(RESULTS_PATH, "%s.svg" % param))
def create_data(using):
    """Populate the database aliased by `using` with the benchmark fixture:
    50 users, 10 groups (each user joins two randomly chosen groups, possibly
    the same one twice), and 10000 Test rows with random owners.
    """
    User.objects.using(using).bulk_create(
        [User(username="user%d" % i) for i in range(50)]
    )
    Group.objects.using(using).bulk_create(
        [Group(name="test%d" % i) for i in range(10)]
    )
    groups = list(Group.objects.using(using))
    for u in User.objects.using(using):
        u.groups.add(choice(groups), choice(groups))
    users = list(User.objects.using(using))
    Test.objects.using(using).bulk_create(
        [Test(name="test%d" % i, owner=choice(users)) for i in range(10000)]
    )
if __name__ == '__main__':
if __name__ == "__main__":
if not os.path.exists(RESULTS_PATH):
os.mkdir(RESULTS_PATH)
@ -293,7 +353,7 @@ if __name__ == '__main__':
old_db_names = {}
for alias in connections:
conn = connections[alias]
old_db_names[alias] = conn.settings_dict['NAME']
old_db_names[alias] = conn.settings_dict["NAME"]
conn.creation.create_test_db(autoclobber=True)
print("Populating %s" % connections[alias].vendor)

View file

@ -18,6 +18,17 @@ use unoptimised queries. Of course, they often lack useful indexes
But what you may not know is that
**the ORM currently generates totally unoptimised queries** [#]_.
You can run the benchmarks yourself (officially supported on Linux
and Mac). You will need a database called "cachalot" on MySQL and PostgreSQL.
Additionally, on PostgreSQL, you will need to create a role
called "cachalot." You can also run the benchmark, and it'll raise
errors with specific instructions for how to fix them.
#. Install: ``pip install -r requirements/benchmark.txt``
#. Run: ``python benchmark.py``
The output will be in benchmark/docs/TODAY'S_DATE/
Conditions
..........