From c3bf722b1f2cb72784af76008c73df941efacefc Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Mon, 15 Jun 2026 14:29:08 +0200 Subject: [PATCH] chore(schema): re-sync vendored ingest.v1.json from data-ingestors master The embedded CLI copy had drifted from tracebloc/data-ingestors master, so the `Schema drift check` CI job (scripts/sync-schema.sh --check) fails on every cli PR. Re-synced via `scripts/sync-schema.sh`. Brings in the current upstream schema, including a target_size description change ("[height, width]" -> "[width, height]", noting it matches PIL.Image.size + ImageResolutionValidator) and the latest category-conditional blocks. Unblocks the drift check for open cli PRs (e.g. #78). Separate from data-ingestors itself and from client-runtime#106 (which synced client-runtime's own vendored copy). Co-Authored-By: Claude Opus 4.8 --- internal/schema/ingest.v1.json | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/internal/schema/ingest.v1.json b/internal/schema/ingest.v1.json index d8eb83c..99a509f 100644 --- a/internal/schema/ingest.v1.json +++ b/internal/schema/ingest.v1.json @@ -32,6 +32,7 @@ "semantic_segmentation", "instance_segmentation", "text_classification", + "token_classification", "tabular_classification", "tabular_regression", "time_series_forecasting", @@ -86,7 +87,7 @@ "texts": { "type": "string", "minLength": 1, - "description": "Directory holding text files referenced by the labels CSV. Required for text_classification." + "description": "Directory holding text files referenced by the labels CSV. Required for text_classification and token_classification." }, "sequences": { @@ -143,7 +144,7 @@ "items": { "type": "integer", "minimum": 1 }, "minItems": 2, "maxItems": 2, - "description": "[height, width] to resize images to. Required for keypoint_detection (no default — depends on the customer's pose model). Other image categories use category defaults if unset." + "description": "[width, height] to resize images to. Required for keypoint_detection (no default — depends on the customer's pose model). Other image categories use category defaults if unset. The order matches PIL.Image.size and what ImageResolutionValidator expects." }, "number_of_keypoints": { @@ -198,7 +199,7 @@ "items": { "type": "integer", "minimum": 1 }, "minItems": 2, "maxItems": 2, - "description": "[height, width]. Image categories only. Default [512, 512]." + "description": "[width, height]. Image categories only. Default [512, 512]. The order matches PIL.Image.size and what ImageResolutionValidator expects." }, "extension": { "type": "string", @@ -321,6 +322,14 @@ }, "then": { "required": ["texts"] } }, + { + "description": "token_classification requires `texts`.", + "if": { + "properties": { "category": { "const": "token_classification" } }, + "required": ["category"] + }, + "then": { "required": ["texts"] } + }, { "description": "masked_language_modeling requires `sequences`.", "if": { @@ -390,6 +399,7 @@ "semantic_segmentation", "instance_segmentation", "text_classification", + "token_classification", "tabular_classification" ] } @@ -397,6 +407,20 @@ "required": ["category"] }, "then": { "required": ["label"] } + }, + { + "description": "Self-supervised categories MUST NOT set `label`. The shipped CSV has no label column, and the framework registers no edge-label metadata for them. Setting `label` anyway used to ingest rows successfully, then crash at backend registration with a misleading HTTP 400 'No data found' (issue #213). Reject at submission instead.", + "if": { + "properties": { + "category": { + "enum": [ + "masked_language_modeling" + ] + } + }, + "required": ["category"] + }, + "then": { "not": { "required": ["label"] } } } ] }