Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions contentcuration/contentcuration/tests/test_user.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
from .base import StudioTestCase
from .testdata import fileobj_video
from contentcuration.models import DEFAULT_CONTENT_DEFAULTS
from contentcuration.models import File
from contentcuration.models import Invitation
from contentcuration.models import Language
from contentcuration.models import User
from contentcuration.models import UserSubscription
from contentcuration.tests import testdata
Expand Down Expand Up @@ -163,6 +165,80 @@ def test_user_csv_export(self):
self.assertIn(_format_size(videos[index - 1].file_size), row)
self.assertEqual(index, len(videos))

def test_user_csv_export_reports_channel_and_content_metadata(self):

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

praise: Asserting specific field values (row["Channel"], row["Author"], row["Language"], etc.) rather than just presence makes this a meaningful regression guard — it will catch aliasing mistakes in the CTE column names, not just "the CSV was produced".

language = Language.objects.create(lang_code="fr", readable_name="French")
file_record = File.objects.filter(
contentnode__tree_id=self.channel.main_tree.tree_id
).first()
file_record.uploaded_by = self.user
file_record.original_filename = "sample-video.mp4"
file_record.language = None
file_record.save()

contentnode = file_record.contentnode
contentnode.title = "CSV Content Title"
contentnode.description = "CSV Description"
contentnode.author = "CSV Author"
contentnode.language = language
contentnode.license_description = "CSV License Description"
contentnode.copyright_holder = "CSV Copyright Holder"
contentnode.save()

with tempfile.NamedTemporaryFile(suffix=".csv") as tempf:
write_user_csv(self.user, path=tempf.name)

with io.open(tempf.name, "r", encoding="utf-8") as csv_file:
rows = list(csv.DictReader(csv_file, delimiter=","))

self.assertTrue(rows)
row = rows[0]
self.assertEqual(row["Channel"], self.channel.name)
self.assertEqual(row["Title"], "CSV Content Title")
self.assertEqual(row["Filename"], "sample-video.mp4")
self.assertEqual(row["Description"], "CSV Description")
self.assertEqual(row["Author"], "CSV Author")
self.assertEqual(row["Language"], "French")
self.assertEqual(row["License Description"], "CSV License Description")
self.assertEqual(row["Copyright Holder"], "CSV Copyright Holder")

def test_user_csv_export_reports_staged_files(self):
    """A staged file shows up in the user CSV export as one placeholder row."""
    self.user.staged_files.create(checksum="stagedchecksum", file_size=2048)

    with tempfile.NamedTemporaryFile(suffix=".csv") as export_file:
        export_path = export_file.name
        write_user_csv(self.user, path=export_path)

        # Read the export back while the temporary file still exists.
        with io.open(export_path, "r", encoding="utf-8") as handle:
            reader = csv.DictReader(handle, delimiter=",")
            staged = [
                record for record in reader if record["Filename"] == "Staged File"
            ]

    self.assertEqual(len(staged), 1)
    only_row = staged[0]
    # Checked in the same column order as the individual assertions they replace.
    expected_columns = (
        ("Channel", "No Channel"),
        ("Title", "No Resource"),
        ("File Size", _format_size(2048)),
        ("URL", ""),
    )
    for column, expected in expected_columns:
        self.assertEqual(only_row[column], expected)

def test_user_csv_export_includes_files_without_contentnode(self):
    """Files not attached to any content node are still exported.

    A file uploaded by the user with no content node must produce a CSV row
    whose title is "No resource" and whose channel is "No Channel".
    """
    file_without_contentnode = fileobj_video()
    self.assertIsNone(file_without_contentnode.contentnode_id)
    file_without_contentnode.uploaded_by = self.user
    file_without_contentnode.original_filename = "no-contentnode.mp4"
    file_without_contentnode.save()

    with tempfile.NamedTemporaryFile(suffix=".csv") as tempf:
        write_user_csv(self.user, path=tempf.name)

        # Read the export back while the temporary file still exists.
        with io.open(tempf.name, "r", encoding="utf-8") as csv_file:
            rows = list(csv.DictReader(csv_file, delimiter=","))

    # Collect matches rather than calling next() without a default, so a
    # missing row is reported as an assertion failure with a message
    # instead of surfacing as a bare StopIteration error.
    matching_rows = [
        row
        for row in rows
        if row["Filename"] == file_without_contentnode.original_filename
    ]
    self.assertTrue(
        matching_rows,
        "No CSV row found for the file without a content node",
    )
    row = matching_rows[0]
    self.assertEqual(row["Title"], "No resource")
    self.assertEqual(row["Channel"], "No Channel")


class UserEffectiveDiskSpaceTest(StudioTestCase):
def setUp(self):
Expand Down
136 changes: 103 additions & 33 deletions contentcuration/contentcuration/utils/csv_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@

from django.conf import settings
from django.contrib.sites.models import Site
from django.db.models import Exists
from django.db.models import F
from django.db.models import OuterRef
from django.db.models import Q
from django.db.models import Subquery
from django.db.models.sql.constants import LOUTER
from django.utils.translation import gettext as _
from le_utils.constants import content_kinds

from contentcuration.db.models.query import With
from contentcuration.models import Channel
from contentcuration.models import ContentNode
from contentcuration.models import generate_storage_url

if not os.path.exists(settings.CSV_ROOT):
Expand Down Expand Up @@ -43,29 +47,24 @@ def generate_user_csv_filename(user):


def _write_user_row(file, writer, domain):
filename = "{}.{}".format(file["checksum"], file["file_format__extension"])
filename = "{}.{}".format(file["checksum"], file["file_extension"])
writer.writerow(
[
file["channel_name"] or _("No Channel"),
file["contentnode__title"] or _("No resource"),
file["node_title"] or _("No resource"),
next(
(
k[1]
for k in content_kinds.choices
if k[0] == file["contentnode__kind_id"]
),
(k[1] for k in content_kinds.choices if k[0] == file["node_kind_id"]),
"",
),
file["original_filename"],
_format_size(file["file_size"] or 0),
generate_storage_url(filename),
file["contentnode__description"],
file["contentnode__author"],
file["language__readable_name"]
or file["contentnode__language__readable_name"],
file["contentnode__license__license_name"],
file["contentnode__license_description"],
file["contentnode__copyright_holder"],
file["node_description"],
file["node_author"],
file["file_language"] or file["node_language"],
file["node_license_name"],
file["node_license_description"],
file["node_copyright_holder"],
]
)

Expand Down Expand Up @@ -100,34 +99,105 @@ def write_user_csv(user, path=None):

domain = Site.objects.get(pk=1).domain

# Get all user files
channel_query = Channel.objects.filter(
Q(main_tree__tree_id=OuterRef("contentnode__tree_id"))
| Q(trash_tree__tree_id=OuterRef("contentnode__tree_id"))
# Build CTEs so we first reduce to this user's files, then resolve only
# needed content node and channel fields.
user_files_cte = With(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

praise: The three-CTE decomposition (user_files_cte → content_nodes_cte → channel_names_cte) is well-structured — each CTE reduces cardinality before the next step, which is exactly why the planner avoids the 676-iteration correlated channel subquery that was dominating the old plan. Easy to follow and a good pattern for future similar optimisations.

user.files.values(
"id",
"contentnode_id",
"original_filename",
"file_size",
"checksum",
file_extension=F("file_format__extension"),
file_language=F("language__readable_name"),
),
name="user_files",
)

content_nodes_cte = With(
user_files_cte.join(
ContentNode.objects.all(),
id=user_files_cte.col.contentnode_id,
)
.values(
"id",
"tree_id",
node_title=F("title"),
node_kind_id=F("kind_id"),
node_description=F("description"),
node_author=F("author"),
node_language=F("language__readable_name"),
node_license_name=F("license__license_name"),
node_license_description=F("license_description"),
node_copyright_holder=F("copyright_holder"),
)
.distinct(),
name="content_nodes",
)

main_channel_names = Channel.objects.filter(
Exists(
content_nodes_cte.queryset().filter(
tree_id=OuterRef("main_tree__tree_id")
)
)
).values(
tree_id=F("main_tree__tree_id"),
channel_name=F("name"),
)
trash_channel_names = Channel.objects.filter(
Exists(
content_nodes_cte.queryset().filter(
tree_id=OuterRef("trash_tree__tree_id")
)
)
).values(
tree_id=F("trash_tree__tree_id"),
channel_name=F("name"),
)
channel_names_cte = With(
main_channel_names.union(trash_channel_names), name="channel_names"
)

user_files = (
user.files.select_related("language", "contentnode", "file_format")
content_nodes_cte.join(
Comment thread
bjester marked this conversation as resolved.
user_files_cte.queryset(),
contentnode_id=content_nodes_cte.col.id,
_join_type=LOUTER,
)
.with_cte(user_files_cte)
.with_cte(content_nodes_cte)
.with_cte(channel_names_cte)
.annotate(
channel_name=Subquery(channel_query.values_list("name", flat=True)[:1])
channel_name=Subquery(
channel_names_cte.queryset()
.filter(tree_id=content_nodes_cte.col.tree_id)
.values("channel_name")[:1]
),
node_title=content_nodes_cte.col.node_title,
node_kind_id=content_nodes_cte.col.node_kind_id,
node_description=content_nodes_cte.col.node_description,
node_author=content_nodes_cte.col.node_author,
node_language=content_nodes_cte.col.node_language,
node_license_name=content_nodes_cte.col.node_license_name,
node_license_description=content_nodes_cte.col.node_license_description,
node_copyright_holder=content_nodes_cte.col.node_copyright_holder,
)
.values(
"channel_name",
"original_filename",
"file_size",
"checksum",
"file_format__extension",
"language__readable_name",
"contentnode__title",
"contentnode__language__readable_name",
"contentnode__license__license_name",
"contentnode__kind_id",
"contentnode__description",
"contentnode__author",
"contentnode__provider",
"contentnode__aggregator",
"contentnode__license_description",
"contentnode__copyright_holder",
"file_extension",
"file_language",
"node_title",
"node_kind_id",
"node_description",
"node_author",
"node_language",
"node_license_name",
"node_license_description",
"node_copyright_holder",
)
)
for file in user_files:
Expand Down
Loading