create table users (
id serial primary key not null,
created_at timestamp not null default now()
);
create table users_telephones (
user_id int references users(id) not null,
is_primary boolean not null default true,
telephone varchar not null
);
insert into users
select i, NOW() + (random() * (interval '90 days')) + '30 days' from generate_series(1, 10000000) i;
insert into users_telephones select id, true, random() :: text from users limit 10000000; -- all users have a primary telephone
insert into users_telephones select id, false, random() :: text from users limit 200000; -- some users have a non primary telephone
create index on users(created_at);
create index on users_telephones(user_id);
create index on users_telephones(user_id, is_primary) where is_primary;
select count(*) from users;
count
----------
10000000
(1 row)
Time: 160.911 ms
select count(*) from users_telephones;
count
----------
10200000
(1 row)
Time: 176.361 ms
select
*
from
users u
join users_telephones ut on u.id = ut.user_id
where
ut.is_primary
order by
created_at
limit
10;
id | created_at | user_id | is_primary | telephone
---------+----------------------------+---------+------------+--------------------
9017755 | 2023-09-11 11:45:37.65744 | 9017755 | t | 0.7182410419408853
6061687 | 2023-09-11 11:45:39.271054 | 6061687 | t | 0.3608686654204689
9823470 | 2023-09-11 11:45:39.284201 | 9823470 | t | 0.3026398665522869
2622527 | 2023-09-11 11:45:39.919549 | 2622527 | t | 0.1929579716250771
7585920 | 2023-09-11 11:45:40.256742 | 7585920 | t | 0.3830236472843005
5077138 | 2023-09-11 11:45:41.076164 | 5077138 | t | 0.9058939392225689
1496883 | 2023-09-11 11:45:42.459194 | 1496883 | t | 0.1519510558344308
9234364 | 2023-09-11 11:45:42.965896 | 9234364 | t | 0.8254433522266105
6988331 | 2023-09-11 11:45:43.130548 | 6988331 | t | 0.9577098184736457
7916398 | 2023-09-11 11:45:43.559425 | 7916398 | t | 0.9681218675498862
(10 rows)
Time: 0.973 msThis is only fast because 100% of users have a phone number as a primary contact, so the join filter is essentially meaningless. If in the contact table, the filtered number is a small percentage of the total (e.g. most users have an email as their primary contact, not a phone number), but still a good size (e.g. there’s still hundreds of thousands to millions of phone primary contacts), it’s a much harder query.
It’s probably also fast because you have a warm cache - e.g. there’s enough memory for the DB to have the indexes 100% in memory, which is just not feasible with large DBs in the real world, where you can easily have >100GB of indexes + hot data, and the DB can’t keep it all in memory. In most real world scenarios, having to somewhat frequently read pages of indexes off disk, into memory, to satisfy queries, is common.
Try it again, with the exact same data, but:
- Search for users with a non-primary phone contact (you have 200,000 of these, and 10,000,000 users)
- Give the DB say 1/3 the memory of your total index size, so the complete indexes can’t be in memory
- Run the query right after starting PG up, to ensure the cache is cold (with a hot cache, almost everything is fast, but in real world situations with lots of users the cache isn’t consistently hot)
That's the point? Sure there is a scale where it's infeasible, but you can quite easily (albeit it's pricey) get DB instances with hundreds or thousands of GiB of RAM. Even if you can't get everything into it, your working set is often not the size of the entire data set. My company's main MySQL primary has around 800 GiB of storage, and only about 1/3 of that in RAM. Disk read IOPS are usually 0.
Nevertheless, I recreated this in Postgres 15.
The total index size of the two tables is 790 MB according to `\di+*`, so I'll set `shared_buffers` to 263 MB, then restart and re-run.
For reference, time with current settings is about 6.8 msec for `is_primary`, and 1395 msec for `NOT is_primary`.
After a restart, I ran the query for `NOT is_primary` first, which took 1740 msec. The first run of the query with `is_primary` took 21 msec.
My DB is hosted on older hardware, and the files themselves live on NVMe drives presented via Ceph over 1GBe.
EDIT: I forgot that Postgres uses OS cache for a lot of its memory, not just its own. Re-did this, running `sync; sync; sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'` in between shutting down/starting up Postgres. 16726 msec and 79 msec, respectively. So yes, a lot slower, but a. I don't think this is realistic for a production database b. I'm still not clear about how you think JOINs enter into this. The slowdown comes entirely from having to run a table scan.
FWIW, I don’t think joins are bad, I’m 100% for normalized DB schemas with joins. But I’ve done tonnes of performance work over the past ~10 years, and run into a bunch of real world cases where, when caches are cold (which does happen frequently with large datasets and limited budgets), queries similar to the above (join two tables, read a page of data, sorting on one table and filtering on the other) take 10s of seconds, sometimes even minutes with very large datasets. In those cases, the only solution has been to denormalize so we can create a compound index, with filter key(s) as the prefix, sort key as the suffix, which makes the query consistently fast. I am not at all suggesting this as the default, better to default to normalized with joins, and only do this for these specific cases. Generally a company just has one or a few cases where this is necessary, can just do the ugly denormalization + compound indexes for these few hot spots. But when ppl say “joins are slow”, these cases are examples of where it’s true.
Re: your above 17 second query, if you do something like adding fields to the user table for primary and secondary contact method (denormalizing), and then create a compound index with the necessary filter keys first, and the sort key second, I think you’ll find the query (which no longer needs a join) is quite fast even if caches are ice cold.
postgres=# CREATE INDEX sec_phone_created_at ON hn_phone_new (phone_secondary, created_at) WHERE phone_secondary IS NOT NULL;
I reset `shared_buffers` down to the same as before - 263 MB - although the size of this index is tiny, < 10 MB, so realistically I can't shrink buffers down that far anyway. I then did the same `sync/drop cache` as before. postgres=# SELECT * FROM hn_phone_new WHERE phone_secondary IS NOT NULL ORDER BY created_at LIMIT 10;
id | created_at | phone_primary | phone_secondary
--------+---------------------+--------------------+--------------------
58816 | 1995-05-23 03:22:02 | +49 030 522866-87 | +1 159-445-4810
49964 | 1995-05-23 03:23:00 | +61 02 7440 8606 | +254 20 925 892
171828 | 1995-05-23 05:06:47 | +380 32 393-35-89 | +49 030 429376-29
78333 | 1995-05-23 05:31:22 | +380 32 147-11-20 | +52 55 6409 5253
24264 | 1995-05-23 06:47:21 | +44 0131 6506 1823 | +49 030 610965-83
96662 | 1995-05-23 06:57:03 | +52 55 1473 0538 | +61 02 5414 8204
15023 | 1995-05-23 07:55:37 | +44 0131 7959 1581 | +44 0131 8491 6194
52029 | 1995-05-23 08:59:19 | +380 32 430-77-54 | +254 20 374 856
20518 | 1995-05-23 09:51:14 | +380 32 264-21-79 | +52 55 7787 0236
80273 | 1995-05-23 14:59:26 | +61 02 8863 4466 | +33 01 16 10 78 56
(10 rows)
Time: 2258.807 ms (00:02.259)
So yes, significant improvement as you'd expect. I then dropped the index and swapped the order: postgres=# DROP INDEX sec_phone_created_at;
postgres=# CREATE INDEX created_at_sec_phone ON hn_phone_new (created_at, phone_secondary) WHERE phone_secondary IS NOT NULL;
Reset everything as before, and re-ran the same query: Time: 221.392 ms
Thinking that like MySQL, a portion of the `shared_buffers` had been saved to disk and put back in upon restart (honestly I don't know if Postgres does this), I attempted to flush it by running a few `SELECT COUNT(*)` on other, larger tables, then re-running the query. Time: 365.961 ms
This is what `EXPLAIN VERBOSE` looks like for the original index: Limit (cost=8.44..8.45 rows=1 width=61)
Output: id, created_at, phone_primary, phone_secondary
-> Sort (cost=8.44..8.45 rows=1 width=61)
Output: id, created_at, phone_primary, phone_secondary
Sort Key: hn_phone_new.created_at
-> Index Scan using sec_phone_created_at on public.hn_phone_new (cost=0.42..8.43 rows=1 width=61)
Output: id, created_at, phone_primary, phone_secondary
(7 rows)
And this is what it looks like for the second, with the columns swapped: Limit (cost=0.42..8.43 rows=1 width=61)
Output: id, created_at, phone_primary, phone_secondary
-> Index Scan using created_at_sec_phone on public.hn_phone_new (cost=0.42..8.43 rows=1 width=61)
Output: id, created_at, phone_primary, phone_secondary
(4 rows)
So it actually needs to be reversed, so that the query planner doesn't have to add a sort step for the ORDER BY.But, yeah, exactly. Everyone thinks they need to optimise the life out of this stuff at the beginning but the db can do a lot with normalised data and the appropriate indexes.
Side note - is_primary isn’t required in the partial index itself since they’ll all be “true” due to the where clause.
Probably nitpicking but these types of measures are usually tricky to interpret because there is a high chance your indexes (maybe even rows) are still on PostgreSQL shared buffers and OS cache and might not reflect real usage performance.
To get a more "worst-case" measure, after your inserts and indexes creation, you can restart your database server + flush OS pages cache (e.g. drop_caches for Linux), then do the measure.
Sometimes the difference is huge, although I don't suspect it will be in this case.
I did have to do a few manual updates after data load because the aforementioned program can't make foreign keys yet, and also for bools (which MySQL stores as tinyint(1)) I'm randomly generating them via `id & 1`, which isn't what you had.
Also, I gave `hn_phone` its own auto-increment int as a PK, so I could have a non-unique index on `user_id`. In MySQL, if you create a table without a PK, you get one of these, in descending order of precedence:
* The first indexed UNIQUE NOT NULL column promoted to PK
* An invisible, auto-generated, auto-incrementing integer column called `my_row_id` as PK (MySQL >= 8.0.30, if sql_generate_invisible_primary_key=1)
* A hidden index called `GEN_CLUST_INDEX` created on a super-invisible (i.e. doesn't show up in table definition) column called `ROW_ID`, but that column is shared across the entire DB so please don't do this
It's worth noting that since the first 10,000,000 rows all have `is_primary` set, this can finish extremely quickly. If you invert that match with these tables, you have to do a table scan on `hn_phone`, and the time jumps up to about 5650 msec. If you change the `hn_phone` index to be a composite on (`user_id`, `is_primary`) and then rewrite the query to use a subquery instead of a join, the time drops to around 7 msec. You might see a slight speed-up if you index `created_at` in descending order if that was the normal access pattern.
Anyway:
OS: Debian Bullseye 5.10.0-23-amd64
Virtualized: Yes (Proxmox)
CPU: E5-2650 v2 @ 2.60GHz
Allocated Core Count: 16
Allocated RAM: 64 GiB PC3-12800R
Disk: Samsung PM983 1.92 TiB via Ceph
Filesystem: XFS
Mount Options: defaults,noatime
MySQL Version: 8.0.34
MySQL Options (non-default):
innodb_buffer_pool_instances = 16
innodb_buffer_pool_chunk_size = 134217728
innodb_buffer_pool_size = 17179869184
innodb_numa_interleave = 1
innodb_sync_array_size = 16 # this shouldn't apply here, but listing anyway
innodb_flush_method = O_DIRECT
innodb_read_io_threads = 16
innodb_write_io_threads = 16 # this shouldn't apply here, but listing anyway
CREATE TABLE `hn_user` (
`id` int unsigned NOT NULL AUTO_INCREMENT,
`created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
KEY `user_created_at` (`created_at`)
);
CREATE TABLE `hn_phone` (
`id` int unsigned NOT NULL AUTO_INCREMENT,
`user_id` int unsigned NOT NULL,
`is_primary` tinyint(1) NOT NULL DEFAULT '1',
`phone` varchar(255) NOT NULL,
PRIMARY KEY (`id`),
KEY `user_id` (`user_id`),
CONSTRAINT `hn_phone_ibfk_1` FOREIGN KEY (`user_id`) REFERENCES `hn_user` (`id`)
);
mysql> SELECT COUNT(*) FROM hn_user UNION SELECT COUNT(*) FROM hn_phone;
+----------+
| COUNT(*) |
+----------+
| 10000000 |
| 10200000 |
+----------+
2 rows in set (1.20 sec)
mysql> SELECT
u.id,
u.created_at,
ut.is_primary,
ut.phone
FROM
hn_user u
JOIN hn_phone ut ON u.id = ut.user_id
WHERE
ut.is_primary
ORDER BY
u.created_at DESC
LIMIT 10;
+---------+---------------------+------------+--------------------+
| id | created_at | is_primary | phone |
+---------+---------------------+------------+--------------------+
| 6906106 | 2023-08-12 06:08:25 | 1 | +61 02 5317 2261 |
| 6906106 | 2023-08-12 06:08:25 | 1 | +254 20 294 205 |
| 6738922 | 2023-08-12 06:07:12 | 1 | +61 02 1247 3361 |
| 6738922 | 2023-08-12 06:07:12 | 1 | +44 0131 8386 4494 |
| 7449553 | 2023-08-12 06:03:55 | 1 | +61 02 7649 6731 |
| 7449553 | 2023-08-12 06:03:55 | 1 | +61 02 7893 9835 |
| 6908862 | 2023-08-12 05:51:52 | 1 | +81 03 6743-6893 |
| 6908862 | 2023-08-12 05:51:52 | 1 | +44 0131 8414 7888 |
| 4134961 | 2023-08-12 05:51:42 | 1 | +1 614-908-1719 |
| 4134961 | 2023-08-12 05:51:42 | 1 | +44 0131 9898 8958 |
+---------+---------------------+------------+--------------------+
10 rows in set (0.00 sec)
mysql> WITH latest_event AS (
SELECT
event_id
FROM
performance_schema.events_statements_history_long
WHERE
sql_text LIKE 'SELECT u.id%'
ORDER BY
event_id DESC
LIMIT 1
)
SELECT
event_name,
TRUNCATE(
TIMER_WAIT / POW(10, 9),
3
) AS 'duration (msec)'
FROM
performance_schema.events_stages_history_long stg
JOIN latest_event ON stg.nesting_event_id = latest_event.event_id
UNION
SELECT
"total",
TRUNCATE(
TIMER_WAIT / POW(10, 9),
3
)
FROM
performance_schema.events_statements_history_long stmt
JOIN latest_event ON stmt.event_id = latest_event.event_id;
+------------------------------------------------+-----------------+
| event_name | duration (msec) |
+------------------------------------------------+-----------------+
| stage/sql/starting | 0.261 |
| stage/sql/Executing hook on transaction begin. | 0.003 |
| stage/sql/starting | 0.016 |
| stage/sql/checking permissions | 0.006 |
| stage/sql/checking permissions | 0.005 |
| stage/sql/Opening tables | 0.134 |
| stage/sql/init | 0.008 |
| stage/sql/System lock | 0.023 |
| stage/sql/optimizing | 0.034 |
| stage/sql/statistics | 0.087 |
| stage/sql/preparing | 0.074 |
| stage/sql/executing | 0.74 |
| stage/sql/end | 0.003 |
| stage/sql/query end | 0.003 |
| stage/sql/waiting for handler commit | 0.025 |
| stage/sql/closing tables | 0.019 |
| stage/sql/freeing items | 0.176 |
| stage/sql/cleaning up | 0.003 |
| total | 1.654 |
+------------------------------------------------+-----------------+
19 rows in set (0.00 sec)
[0]: https://github.com/stephanGarland/genSQL # shameless plug; it's super messy and probably unintuitive, but it's getting better/faster and has been a fun ride learning how fast you can make Python (and when to offload to C).