Don Kirkby Don Kirkby - 5 months ago 20
Python Question

Django query on child records without getting duplicate rows

I'm trying to write a Django query to find a set of parent records with certain kinds of child records. The problem is that a parent record with two children that match the search will included twice in the results.

How can I get each parent once, even when it has more than one matching child?

I've included a simple example below that demonstrates the problem.

Blog
is the parent, and
Entry
is the child. When I search for blogs that contain an entry with "Hello" in the title, I get two copies of Jimmy's blog.

Here are the records I created and the query I tried:

b = Blog(name="Jimmy's Jottings")
b.save()
Entry(blog=b, headline='Hello, World!').save()
Entry(blog=b, headline='Hello Kitty').save()

blog_count = Blog.objects.filter(entries__headline__contains='Hello').count()
assert blog_count == 1, blog_count


You can see there's only one blog, but the assert fails with a count of two.

Here's the full example:

# Tested with Django 1.9.2
import sys

import django
from django.apps import apps
from django.apps.config import AppConfig
from django.conf import settings
from django.db import connections, models, DEFAULT_DB_ALIAS
from django.db.models.base import ModelBase

NAME = 'udjango'


def main():
setup()

class Blog(models.Model):
name = models.CharField(max_length=100)
tagline = models.TextField()

def __str__(self): # __unicode__ on Python 2
return self.name

class Entry(models.Model):
blog = models.ForeignKey(Blog, related_name='entries')
headline = models.CharField(max_length=255)
body_text = models.TextField()

def __str__(self): # __unicode__ on Python 2
return self.headline

syncdb(Blog)
syncdb(Entry)

b = Blog(name="Jimmy's Jottings")
b.save()
Entry(blog=b, headline='Hello, World!').save()
Entry(blog=b, headline='Hello Kitty').save()

blog_count = Blog.objects.filter(entries__headline__contains='Hello').count()
assert blog_count == 1, blog_count

print('Done.')


def setup():
DB_FILE = NAME + '.db'
with open(DB_FILE, 'w'):
pass # wipe the database
settings.configure(
DEBUG=True,
DATABASES={
DEFAULT_DB_ALIAS: {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': DB_FILE}},
LOGGING={'version': 1,
'disable_existing_loggers': False,
'formatters': {
'debug': {
'format': '%(asctime)s[%(levelname)s]'
'%(name)s.%(funcName)s(): %(message)s',
'datefmt': '%Y-%m-%d %H:%M:%S'}},
'handlers': {
'console': {
'level': 'DEBUG',
'class': 'logging.StreamHandler',
'formatter': 'debug'}},
'root': {
'handlers': ['console'],
'level': 'WARN'},
'loggers': {
"django.db": {"level": "WARN"}}})
app_config = AppConfig(NAME, sys.modules['__main__'])
apps.populate([app_config])
django.setup()
original_new_func = ModelBase.__new__

@staticmethod
def patched_new(cls, name, bases, attrs):
if 'Meta' not in attrs:
class Meta:
app_label = NAME
attrs['Meta'] = Meta
return original_new_func(cls, name, bases, attrs)
ModelBase.__new__ = patched_new


def syncdb(model):
""" Standard syncdb expects models to be in reliable locations.

Based on https://github.com/django/django/blob/1.9.3
/django/core/management/commands/migrate.py#L285
"""
connection = connections[DEFAULT_DB_ALIAS]
with connection.schema_editor() as editor:
editor.create_model(model)

main()

Answer

The trick is to use a subquery that finds the blog ids of the matching children and then search for all blogs that have an id in that subquery. Then the subquery can have duplicates without causing duplicates in the main query.

Here's the fixed query:

blog_ids = Entry.objects.filter(headline__contains='Hello').values('blog_id')
blog_count = Blog.objects.filter(id__in=blog_ids).count()
assert blog_count == 1, blog_count

Here's the SQL query that generates:

SELECT  COUNT(*) AS "__count"
FROM    "udjango_blog"
WHERE   "udjango_blog"."id" IN
        (
        SELECT  U0."blog_id"
        FROM    "udjango_entry" U0
        WHERE   U0."headline" LIKE '%Hello%' ESCAPE '\'
        )