Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import django.db.models.deletion
from django.db import migrations, models


# Target model of every ForeignKey altered below. Named once so the three
# operations cannot drift apart and the literal is not repeated.
_ADAPTER_INSTANCE_MODEL = "adapter_processor_v2.adapterinstance"


class Migration(migrations.Migration):
    """Make the ProfileManager extraction adapter foreign keys nullable.

    Alters ``vector_store``, ``embedding_model`` and ``x2text`` on
    ``profilemanager`` to ``blank=True, null=True``. Each field keeps
    ``on_delete=PROTECT`` and its ``related_name``.
    """

    dependencies = [
        ("adapter_processor_v2", "0001_initial"),
        ("prompt_profile_manager_v2", "0005_profilemanager_shared_to_org_and_more"),
    ]

    operations = [
        migrations.AlterField(
            model_name="profilemanager",
            name="vector_store",
            field=models.ForeignKey(
                blank=True,
                db_comment="Field to store the chosen vector store.",
                null=True,
                on_delete=django.db.models.deletion.PROTECT,
                related_name="profiles_vector_store",
                to=_ADAPTER_INSTANCE_MODEL,
            ),
        ),
        migrations.AlterField(
            model_name="profilemanager",
            name="embedding_model",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.PROTECT,
                related_name="profiles_embedding_model",
                to=_ADAPTER_INSTANCE_MODEL,
            ),
        ),
        migrations.AlterField(
            model_name="profilemanager",
            name="x2text",
            field=models.ForeignKey(
                blank=True,
                db_comment="Field to store the X2Text Adapter chosen by the user",
                null=True,
                on_delete=django.db.models.deletion.PROTECT,
                related_name="profiles_x2text",
                to=_ADAPTER_INSTANCE_MODEL,
            ),
        ),
    ]
12 changes: 6 additions & 6 deletions backend/prompt_studio/prompt_profile_manager_v2/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,15 +61,15 @@ class RetrievalStrategy(models.TextChoices):
vector_store = models.ForeignKey(
AdapterInstance,
db_comment="Field to store the chosen vector store.",
blank=False,
null=False,
blank=True,
null=True,
on_delete=models.PROTECT,
related_name="profiles_vector_store",
)
embedding_model = models.ForeignKey(
AdapterInstance,
blank=False,
null=False,
blank=True,
null=True,
on_delete=models.PROTECT,
related_name="profiles_embedding_model",
)
Expand All @@ -84,8 +84,8 @@ class RetrievalStrategy(models.TextChoices):
x2text = models.ForeignKey(
AdapterInstance,
db_comment="Field to store the X2Text Adapter chosen by the user",
blank=False,
null=False,
blank=True,
null=True,
on_delete=models.PROTECT,
related_name="profiles_x2text",
)
Expand Down
52 changes: 49 additions & 3 deletions backend/prompt_studio/prompt_profile_manager_v2/serializers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging

from adapter_processor_v2.adapter_processor import AdapterProcessor
from rest_framework import serializers

from backend.serializers import AuditSerializer
from prompt_studio.prompt_profile_manager_v2.constants import ProfileManagerKeys
Expand All @@ -9,6 +10,14 @@

logger = logging.getLogger(__name__)

# Extraction adapter fields that are only required when at least one prompt
# using this profile needs text extraction (extraction_inputs != "image").
# ProfileManagerSerializer.validate iterates this tuple so that every missing
# adapter is reported together in a single ValidationError.
_TEXT_EXTRACTION_FIELDS = (
    ProfileManagerKeys.VECTOR_STORE,
    ProfileManagerKeys.EMBEDDING_MODEL,
    ProfileManagerKeys.X2TEXT,
)


class ProfileManagerSerializer(AuditSerializer):
class Meta:
Expand All @@ -18,12 +27,49 @@
# the DRF auto-validator that 400s on re-save / PUT before the view runs.
validators = []

def validate(self, attrs):
    """Enforce x2text/embedding/vector_store when text extraction is needed.

    These fields are nullable at the DB level to support image-only
    profiles, but must be populated when any prompt using this profile
    requires text extraction.

    Args:
        attrs: Validated field values for this request. On a partial
            update only the submitted fields are present.

    Returns:
        The validated ``attrs`` as returned by the parent serializer.

    Raises:
        serializers.ValidationError: Keyed by each extraction adapter field
            that would end up empty while text extraction is still needed.
    """
    attrs = super().validate(attrs)

    instance = self.instance
    if instance is None:
        # Create: no prompts linked yet — require extraction adapters
        # by default so existing flows are unaffected.
        needs_text = True
    else:
        # Update: check prompts currently linked to this profile.
        needs_text = instance.tool_studio_prompts.exclude(
            extraction_inputs="image"
        ).exists()

    if not needs_text:
        return attrs

    def _will_be_set(field):
        # A key present in attrs is authoritative, even when its value is
        # None: an explicit null in PATCH/PUT is a removal and must not be
        # masked by the value still stored on the instance.
        if field in attrs:
            return bool(attrs[field])
        # Key absent (partial update): the stored value is what persists.
        return instance is not None and bool(
            getattr(instance, f"{field}_id", None)
        )

    missing = [
        field for field in _TEXT_EXTRACTION_FIELDS if not _will_be_set(field)
    ]
    if missing:
        raise serializers.ValidationError(
            dict.fromkeys(
                missing,
                "This field is required when any linked prompt "
                "uses text extraction.",
            )
        )
    return attrs

def to_representation(self, instance): # type: ignore
rep: dict[str, str] = super().to_representation(instance)
llm = rep[ProfileManagerKeys.LLM]
embedding = rep[ProfileManagerKeys.EMBEDDING_MODEL]
vector_db = rep[ProfileManagerKeys.VECTOR_STORE]
x2text = rep[ProfileManagerKeys.X2TEXT]
embedding = rep.get(ProfileManagerKeys.EMBEDDING_MODEL)
vector_db = rep.get(ProfileManagerKeys.VECTOR_STORE)
x2text = rep.get(ProfileManagerKeys.X2TEXT)
if llm:
rep[ProfileManagerKeys.LLM] = AdapterProcessor.get_adapter_instance_by_id(llm)
if embedding:
Expand Down
4 changes: 4 additions & 0 deletions backend/prompt_studio/prompt_studio_core_v2/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ class ToolStudioPromptKeys:
# Webhook postprocessing settings
ENABLE_POSTPROCESSING_WEBHOOK = "enable_postprocessing_webhook"
POSTPROCESSING_WEBHOOK_URL = "postprocessing_webhook_url"
# Vision mode fields
EXTRACTION_INPUTS = "extraction_inputs"
SOURCE_OF_TRUTH = "source_of_truth"
SOURCE_FILE_PATH = "source_file_path"


class FileViewTypes:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,10 @@ def _build_prompt_output(
if lookup_config := get_lookup_config(prompt):
output["lookup_config"] = lookup_config

# Vision mode fields
output[TSPKeys.EXTRACTION_INPUTS] = prompt.extraction_inputs
output[TSPKeys.SOURCE_OF_TRUTH] = prompt.source_of_truth

output[TSPKeys.EVAL_SETTINGS] = {}
output[TSPKeys.EVAL_SETTINGS][TSPKeys.EVAL_SETTINGS_EVALUATE] = prompt.evaluate
output[TSPKeys.EVAL_SETTINGS][TSPKeys.EVAL_SETTINGS_MONITOR_LLM] = [monitor_llm]
Expand Down Expand Up @@ -825,6 +829,10 @@ def build_fetch_response_payload(
if lookup_config := get_lookup_config(prompt):
output["lookup_config"] = lookup_config

# Vision mode fields
output[TSPKeys.EXTRACTION_INPUTS] = prompt.extraction_inputs
output[TSPKeys.SOURCE_OF_TRUTH] = prompt.source_of_truth

output[TSPKeys.EVAL_SETTINGS] = {}
output[TSPKeys.EVAL_SETTINGS][TSPKeys.EVAL_SETTINGS_EVALUATE] = prompt.evaluate
output[TSPKeys.EVAL_SETTINGS][TSPKeys.EVAL_SETTINGS_MONITOR_LLM] = [monitor_llm]
Expand Down Expand Up @@ -874,6 +882,7 @@ def build_fetch_response_payload(
TSPKeys.FILE_NAME: doc_name,
TSPKeys.FILE_HASH: file_hash,
TSPKeys.FILE_PATH: extract_path,
TSPKeys.SOURCE_FILE_PATH: file_path,
Common.LOG_EVENTS_ID: StateStore.get(Common.LOG_EVENTS_ID),
TSPKeys.EXECUTION_SOURCE: ExecutionSource.IDE.value,
TSPKeys.CUSTOM_DATA: tool.custom_data,
Expand Down Expand Up @@ -1064,6 +1073,7 @@ def build_bulk_fetch_response_payload(
TSPKeys.FILE_NAME: doc_name,
TSPKeys.FILE_HASH: file_hash,
TSPKeys.FILE_PATH: extract_path,
TSPKeys.SOURCE_FILE_PATH: file_path,
Common.LOG_EVENTS_ID: StateStore.get(Common.LOG_EVENTS_ID),
TSPKeys.EXECUTION_SOURCE: ExecutionSource.IDE.value,
TSPKeys.CUSTOM_DATA: tool.custom_data,
Expand Down Expand Up @@ -1225,6 +1235,7 @@ def build_single_pass_payload(
TSPKeys.FILE_HASH: file_hash,
TSPKeys.FILE_NAME: doc_name,
TSPKeys.FILE_PATH: file_path,
TSPKeys.SOURCE_FILE_PATH: doc_path,
Common.LOG_EVENTS_ID: StateStore.get(Common.LOG_EVENTS_ID),
TSPKeys.EXECUTION_SOURCE: ExecutionSource.IDE.value,
TSPKeys.CUSTOM_DATA: tool.custom_data,
Expand Down Expand Up @@ -1950,6 +1961,9 @@ def _fetch_response(
output[TSPKeys.POSTPROCESSING_WEBHOOK_URL] = webhook_url
if lookup_config := get_lookup_config(prompt):
output["lookup_config"] = lookup_config
# Vision mode fields
output[TSPKeys.EXTRACTION_INPUTS] = prompt.extraction_inputs
output[TSPKeys.SOURCE_OF_TRUTH] = prompt.source_of_truth
# Eval settings for the prompt
output[TSPKeys.EVAL_SETTINGS] = {}
output[TSPKeys.EVAL_SETTINGS][TSPKeys.EVAL_SETTINGS_EVALUATE] = prompt.evaluate
Expand Down Expand Up @@ -2000,6 +2014,7 @@ def _fetch_response(
TSPKeys.FILE_NAME: doc_name,
TSPKeys.FILE_HASH: file_hash,
TSPKeys.FILE_PATH: doc_path,
TSPKeys.SOURCE_FILE_PATH: doc_path,
Comment on lines 2016 to +2017

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟠 Major | ⚡ Quick win

SOURCE_FILE_PATH points to the extracted text file, not the source document.

In _fetch_response, doc_path is reassigned at Line 1865 to the extract .txt path (and again to the summarize path at Line 1898), while the original source file is retained in file_path. Setting TSPKeys.SOURCE_FILE_PATH: doc_path therefore passes the extracted-text path downstream. run_vision_completion uses SOURCE_FILE_PATH to read and rasterise the original PDF, so vision-mode prompts on this IDE single-prompt path will fail to rasterise (no pages / read error). Every sibling builder (build_fetch_response_payload Line 885, build_bulk_fetch_response_payload Line 1076, build_single_pass_payload Line 1238) correctly uses the original source path here.

🐛 Proposed fix
             TSPKeys.FILE_PATH: doc_path,
-            TSPKeys.SOURCE_FILE_PATH: doc_path,
+            TSPKeys.SOURCE_FILE_PATH: file_path,
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
TSPKeys.FILE_PATH: doc_path,
TSPKeys.SOURCE_FILE_PATH: doc_path,
TSPKeys.FILE_PATH: doc_path,
TSPKeys.SOURCE_FILE_PATH: file_path,
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py` around
lines 2016 - 2017, In `_fetch_response`, `SOURCE_FILE_PATH` is being set from
the mutated `doc_path` instead of the original source document path. Update the
payload construction to use `file_path` for `TSPKeys.SOURCE_FILE_PATH` (while
keeping `TSPKeys.FILE_PATH` as-is), matching the behavior in
`build_fetch_response_payload`, `build_bulk_fetch_response_payload`, and
`build_single_pass_payload` so `run_vision_completion` can rasterize the
original PDF correctly.

Common.LOG_EVENTS_ID: StateStore.get(Common.LOG_EVENTS_ID),
TSPKeys.EXECUTION_SOURCE: ExecutionSource.IDE.value,
TSPKeys.CUSTOM_DATA: tool.custom_data,
Expand Down
3 changes: 3 additions & 0 deletions backend/prompt_studio/prompt_studio_registry_v2/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ class JsonSchemaKey:
ENABLE_POSTPROCESSING_WEBHOOK = "enable_postprocessing_webhook"
POSTPROCESSING_WEBHOOK_URL = "postprocessing_webhook_url"
WORD_CONFIDENCE_POSTAMBLE = "word_confidence_postamble"
# Vision mode fields
EXTRACTION_INPUTS = "extraction_inputs"
SOURCE_OF_TRUTH = "source_of_truth"


class SpecKey:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,19 @@ def frame_export_json(

embedding_suffix = ""
adapter_id = ""
vector_db = str(default_llm_profile.vector_store.id)
embedding_model = str(default_llm_profile.embedding_model.id)
# Extraction adapters may be null for image-only profiles
vector_db = (
str(default_llm_profile.vector_store.id)
if default_llm_profile.vector_store
else ""
)
embedding_model = (
str(default_llm_profile.embedding_model.id)
if default_llm_profile.embedding_model
else ""
)
llm = str(default_llm_profile.llm.id)
x2text = str(default_llm_profile.x2text.id)
x2text = str(default_llm_profile.x2text.id) if default_llm_profile.x2text else ""

# Tool settings
tool_settings = {}
Expand Down Expand Up @@ -328,36 +337,51 @@ def frame_export_json(
invalidated_outputs.append(prompt.prompt_key)
continue

vector_db = str(prompt.profile_manager.vector_store.id)
embedding_model = str(prompt.profile_manager.embedding_model.id)
llm = str(prompt.profile_manager.llm.id)
x2text = str(prompt.profile_manager.x2text.id)
adapter_id = str(prompt.profile_manager.embedding_model.adapter_id)
embedding_suffix = adapter_id.split("|")[0]
# Extraction adapters may be null for image-only prompts
pm = prompt.profile_manager
vector_db = str(pm.vector_store.id) if pm.vector_store else ""
embedding_model = str(pm.embedding_model.id) if pm.embedding_model else ""
llm = str(pm.llm.id)
x2text = str(pm.x2text.id) if pm.x2text else ""
if pm.embedding_model:
adapter_id = str(pm.embedding_model.adapter_id)
embedding_suffix = adapter_id.split("|")[0]
else:
adapter_id = ""
embedding_suffix = ""

output[JsonSchemaKey.PROMPT] = prompt.prompt
output[JsonSchemaKey.ACTIVE] = prompt.active
output[JsonSchemaKey.REQUIRED] = prompt.required
output[JsonSchemaKey.CHUNK_SIZE] = prompt.profile_manager.chunk_size
output[JsonSchemaKey.CHUNK_SIZE] = pm.chunk_size
output[JsonSchemaKey.VECTOR_DB] = vector_db
output[JsonSchemaKey.EMBEDDING] = embedding_model
output[JsonSchemaKey.X2TEXT_ADAPTER] = x2text
output[JsonSchemaKey.CHUNK_OVERLAP] = prompt.profile_manager.chunk_overlap
output[JsonSchemaKey.CHUNK_OVERLAP] = pm.chunk_overlap
output[JsonSchemaKey.LLM] = llm
output[JsonSchemaKey.PREAMBLE] = tool.preamble
output[JsonSchemaKey.POSTAMBLE] = tool.postamble
output[JsonSchemaKey.GRAMMAR] = grammar_list
output[JsonSchemaKey.TYPE] = prompt.enforce_type
output[JsonSchemaKey.NAME] = prompt.prompt_key
output[JsonSchemaKey.RETRIEVAL_STRATEGY] = (
prompt.profile_manager.retrieval_strategy
)
output[JsonSchemaKey.SIMILARITY_TOP_K] = (
prompt.profile_manager.similarity_top_k
)
output[JsonSchemaKey.SECTION] = prompt.profile_manager.section
output[JsonSchemaKey.REINDEX] = prompt.profile_manager.reindex
output[JsonSchemaKey.RETRIEVAL_STRATEGY] = pm.retrieval_strategy
output[JsonSchemaKey.SIMILARITY_TOP_K] = pm.similarity_top_k
output[JsonSchemaKey.SECTION] = pm.section
output[JsonSchemaKey.REINDEX] = pm.reindex
output[JsonSchemaKey.EMBEDDING_SUFFIX] = embedding_suffix
# Vision mode fields — force text_only when single-pass is enabled
if tool.single_pass_extraction_mode and prompt.extraction_inputs != "text":
logger.warning(
"Single-pass extraction enabled: forcing prompt '%s' "
"from extraction_inputs='%s' to 'text' in export",
prompt.prompt_key,
prompt.extraction_inputs,
)
output[JsonSchemaKey.EXTRACTION_INPUTS] = "text"
output[JsonSchemaKey.SOURCE_OF_TRUTH] = "text"
else:
output[JsonSchemaKey.EXTRACTION_INPUTS] = prompt.extraction_inputs
output[JsonSchemaKey.SOURCE_OF_TRUTH] = prompt.source_of_truth
# Webhook postprocessing settings
output[JsonSchemaKey.ENABLE_POSTPROCESSING_WEBHOOK] = (
prompt.enable_postprocessing_webhook
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from django.db import migrations, models


class Migration(migrations.Migration):
    """Add ``extraction_inputs`` and ``source_of_truth`` to ``toolstudioprompt``.

    Both are text columns restricted to a fixed set of choices and both
    default to ``"text"``.
    """

    dependencies = [
        ("prompt_studio_v2", "0014_alter_toolstudioprompt_enforce_type"),
    ]

    operations = [
        # What gets sent to the LLM for a prompt.
        migrations.AddField(
            name="extraction_inputs",
            model_name="toolstudioprompt",
            field=models.TextField(
                default="text",
                db_comment="What inputs to send to the LLM: text, image, or both",
                choices=[
                    ("text", "Text only (default)"),
                    ("image", "Page image only"),
                    ("both", "Text and page image"),
                ],
            ),
        ),
        # Which of the inputs is treated as the source of truth.
        migrations.AddField(
            name="source_of_truth",
            model_name="toolstudioprompt",
            field=models.TextField(
                default="text",
                db_comment=(
                    "Which input is source of truth "
                    "(only meaningful when extraction_inputs=both)"
                ),
                choices=[
                    ("text", "Text is source of truth"),
                    ("image", "Image is source of truth"),
                ],
            ),
        ),
    ]
Loading