huggingface
diff --git a/‎packages/transformers/docs/plugins/preprocess.js‎
Lines changed: 1 addition & 0 deletions b/‎packages/transformers/docs/plugins/preprocess.js‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎packages/transformers/src/models/models.js‎
Lines changed: 1 addition & 0 deletions b/‎packages/transformers/src/models/models.js‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎packages/transformers/src/models/openai_privacy_filter/modeling_openai_privacy_filter.js‎
Lines changed: 17 additions & 0 deletions b/‎packages/transformers/src/models/openai_privacy_filter/modeling_openai_privacy_filter.js‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎packages/transformers/src/models/registry.js‎
Lines changed: 2 additions & 0 deletions b/‎packages/transformers/src/models/registry.js‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎packages/transformers/src/pipelines/token-classification.js‎
Lines changed: 132 additions & 33 deletions b/‎packages/transformers/src/pipelines/token-classification.js‎
Lines changed: 132 additions & 33 deletions
@@ -74,6 +74,7 @@ function transformType(expr) {
       .replace(/keyof\s+typeof\s+\w+/g, "string") // keyof typeof X -> string
       .replace(/typeof\s+\w+/g, "Object") // typeof X -> Object
       .replace(/\binfer\s+\w+/g, "any") // infer K -> any
+      .replace(/\breadonly\s+/g, "") // readonly T -> T
       .replace(/\(\s*\w[\w<>, ]*\s+extends\s+\w+\s*\?[^)]*\)/g, "any") // (X extends Y ? A : B) -> any
       .replace(/(?<!\w)\(\s*(\w+)\s*\)/g, "$1") // (any) -> any (unwrap parens around simple types, not after words like "function")
       .replace(/(\w+)\?\s*:/g, "$1:") // x?: T -> x: T
 
@@ -121,6 +121,7 @@ export * from './olmo/modeling_olmo.js';
 export * from './olmo2/modeling_olmo2.js';
 export * from './olmo3/modeling_olmo3.js';
 export * from './olmo_hybrid/modeling_olmo_hybrid.js';
+export * from './openai_privacy_filter/modeling_openai_privacy_filter.js';
 export * from './openelm/modeling_openelm.js';
 export * from './opt/modeling_opt.js';
 export * from './owlv2/modeling_owlv2.js';
 
@@ -0,0 +1,17 @@
+import { PreTrainedModel } from '../modeling_utils.js';
+import { SequenceClassifierOutput } from '../modeling_outputs.js';
+
+export class OpenAIPrivacyFilterPreTrainedModel extends PreTrainedModel {}
+export class OpenAIPrivacyFilterModel extends OpenAIPrivacyFilterPreTrainedModel {}
+
+export class OpenAIPrivacyFilterForTokenClassification extends OpenAIPrivacyFilterPreTrainedModel {
+    /**
+     * Calls the model on new inputs.
+     *
+     * @param {Object} model_inputs The inputs to the model.
+     * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
@@ -97,6 +97,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
     ['mgp-str', 'MgpstrForSceneTextRecognition'],
 
     ['style_text_to_speech_2', 'StyleTextToSpeech2Model'],
+    ['openai_privacy_filter', 'OpenAIPrivacyFilterModel'],
 ]);
 
 const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
@@ -238,6 +239,7 @@ const MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['roberta', 'RobertaForTokenClassification'],
     ['xlm', 'XLMForTokenClassification'],
     ['xlm-roberta', 'XLMRobertaForTokenClassification'],
+    ['openai_privacy_filter', 'OpenAIPrivacyFilterForTokenClassification'],
 ]);
 
 export const MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = new Map([
 
@@ -8,28 +8,58 @@ import { max, softmax } from '../utils/maths.js';
  */
 
 /**
- * @typedef {Object} TokenClassificationSingle
- * @property {string} word The token/word classified. This is obtained by decoding the selected tokens.
- * @property {number} score The corresponding probability for `entity`.
- * @property {string} entity The entity predicted for that token/word.
- * @property {number} index The index of the corresponding token in the sentence.
- * @property {number} [start] The index of the start of the corresponding entity in the sentence.
- * @property {number} [end] The index of the end of the corresponding entity in the sentence.
- * @typedef {TokenClassificationSingle[]} TokenClassificationOutput
- *
- * @typedef {Object} TokenClassificationPipelineOptions Parameters specific to token classification pipelines.
+ * Strategy for fusing tokens based on the model prediction.
+ * - `"none"`: Return raw per-token predictions.
+ * - `"simple"`: Group entities using BIO / BIOES tags (see pipeline docs for details).
+ * @typedef {"none" | "simple"} AggregationStrategy
+ */
+
+/**
+ * @typedef {Object} TokenClassificationPipelineOptions
  * @property {string[]} [ignore_labels] A list of labels to ignore.
+ * @property {AggregationStrategy} [aggregation_strategy="none"] Token-fusion strategy.
+ * When set to anything other than `"none"`, results use `entity_group` instead of `entity`.
+ */
+
+/**
+ * Single element of a token-classification result, parameterised by the options type `O` so that
+ * `entity` vs. `entity_group` is known statically based on `aggregation_strategy`.
  *
- * @typedef {TextPipelineConstructorArgs & TokenClassificationPipelineCallback & Disposable} TokenClassificationPipelineType
+ * - Grouped (present when `O["aggregation_strategy"]` is `"simple"`):
+ *     `{ word, score, entity_group }`
+ * - Raw (the default — when `aggregation_strategy` is missing, `"none"`, or `undefined`):
+ *     `{ word, score, entity, index }`
+ * - Both variants also carry optional `start` / `end` character offsets.
+ *
+ * When `O` is the untyped `TokenClassificationPipelineOptions`, the element is the union of both shapes,
+ * narrowable via `if ("entity_group" in item)` / `if (item.entity !== undefined)`.
+ *
+ * @template {TokenClassificationPipelineOptions | undefined} [O=TokenClassificationPipelineOptions]
+ * @typedef {_PickElement<O>[]} TokenClassificationOutput
+ */
+
+/**
+ * @template {TokenClassificationPipelineOptions | undefined} O
+ * @typedef {O extends undefined
+ *   ? _Raw
+ *   : "aggregation_strategy" extends keyof O
+ *     ? (O extends { aggregation_strategy?: infer A }
+ *       ? ([A] extends ["simple"] ? _Grouped
+ *         : [A] extends ["none" | undefined] ? _Raw
+ *         : _Raw | _Grouped)
+ *       : _Raw)
+ *     : _Raw} _PickElement
  */
 
 /**
- * @template T
- * @typedef {T extends string[] ? TokenClassificationOutput[] : TokenClassificationOutput} TokenClassificationPipelineResult
+ * @typedef {{ word: string, score: number, entity: string, index: number, entity_group?: undefined, start?: number, end?: number }} _Raw
+ * @typedef {{ word: string, score: number, entity_group: string, entity?: undefined, index?: undefined, start?: number, end?: number }} _Grouped
  */
 
 /**
- * @typedef {<T extends string | string[]>(texts: T, options?: TokenClassificationPipelineOptions) => Promise<TokenClassificationPipelineResult<T>>} TokenClassificationPipelineCallback
+ * @typedef {TextPipelineConstructorArgs & TokenClassificationPipelineCallback & Disposable} TokenClassificationPipelineType
+ *
+ * @typedef {<Q extends string | string[], const O extends TokenClassificationPipelineOptions = {}>(texts: Q, options?: O) => Promise<Q extends string[] ? TokenClassificationOutput<O>[] : TokenClassificationOutput<O>>} TokenClassificationPipelineCallback
  */
 
 /**
@@ -64,11 +94,29 @@ import { max, softmax } from '../utils/maths.js';
  * //   { entity: 'I-LOC', score: 0.9975294470787048, index: 8, word: 'America' }
  * // ]
  * ```
+ *
+ * **Example:** Group adjacent BIO/BIOES tokens into entity spans using `aggregation_strategy: "simple"`.
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * const classifier = await pipeline('token-classification', 'Xenova/bert-base-NER');
+ * const output = await classifier('My name is Sarah and I live in London', { aggregation_strategy: 'simple' });
+ * // [
+ * //   { entity_group: 'PER', score: 0.9985477924346924, word: 'Sarah' },
+ * //   { entity_group: 'LOC', score: 0.999621570110321, word: 'London' }
+ * // ]
+ * ```
  */
 export class TokenClassificationPipeline
     extends /** @type {new (options: TextPipelineConstructorArgs) => TokenClassificationPipelineType} */ (Pipeline)
 {
-    async _call(texts, { ignore_labels = ['O'] } = {}) {
+    async _call(texts, { ignore_labels = ['O'], aggregation_strategy = 'none' } = {}) {
+        if (aggregation_strategy !== 'none' && aggregation_strategy !== 'simple') {
+            throw new Error(
+                `Invalid aggregation_strategy: "${aggregation_strategy}". Must be one of "none" or "simple".`,
+            );
+        }
+
         const isBatched = Array.isArray(texts);
 
         // Run tokenization
@@ -86,43 +134,94 @@ export class TokenClassificationPipeline
 
         const toReturn = [];
         for (let i = 0; i < logits.dims[0]; ++i) {
-            const ids = model_inputs.input_ids[i];
+            const ids = model_inputs.input_ids[i].tolist();
             const batch = logits[i];
 
-            // List of tokens that aren't ignored
             const tokens = [];
             for (let j = 0; j < batch.dims[0]; ++j) {
                 const tokenData = batch[j];
                 const topScoreIndex = max(tokenData.data)[1];
 
                 const entity = id2label ? id2label[topScoreIndex] : `LABEL_${topScoreIndex}`;
-                if (ignore_labels.includes(entity)) {
-                    // We predicted a token that should be ignored. So, we skip it.
-                    continue;
-                }
+                if (ignore_labels.includes(entity)) continue;
 
                 // TODO add option to keep special tokens?
-                const word = this.tokenizer.decode([ids[j].item()], { skip_special_tokens: true });
-                if (word === '') {
-                    // Was a special token. So, we skip it.
-                    continue;
-                }
+                const word = this.tokenizer.decode([ids[j]], { skip_special_tokens: true });
+                if (word === '') continue; // Was a special token.
 
                 const scores = softmax(tokenData.data);
-
                 tokens.push({
-                    entity: entity,
+                    entity,
                     score: scores[topScoreIndex],
                     index: j,
-                    word: word,
-
+                    word,
                     // TODO: Add support for start and end
-                    // start: null,
-                    // end: null,
                 });
             }
-            toReturn.push(tokens);
+
+            toReturn.push(aggregation_strategy === 'simple' ? groupEntities(tokens, ids, this.tokenizer) : tokens);
         }
         return isBatched ? toReturn : toReturn[0];
     }
 }
+
+/**
+ * Split a raw entity label into its BIOES prefix and tag.
+ *
+ * @param {string} entity
+ * @returns {readonly [prefix: 'B'|'I'|'E'|'S', tag: string]}
+ */
+function getTag(entity) {
+    const p = entity[0];
+    return entity[1] === '-' && (p === 'B' || p === 'I' || p === 'E' || p === 'S')
+        ? [p, entity.slice(2)]
+        : ['I', entity];
+}
+
+/**
+ * Group raw per-token predictions into entity spans using the SIMPLE strategy.
+ *
+ * The only "continue" predicate is: a non-`B`/non-`S` token whose tag matches
+ * the open group's tag, when that group hasn't been closed by an `E` / `S`.
+ * Everything else starts a fresh group.
+ *
+ * @param {_Raw[]} tokens
+ * @param {number[]} ids Full input_ids for the sequence (indexed by `token.index`), used to re-decode
+ *   each group so the joined `word` matches what the tokenizer would produce.
+ * @param {any} tokenizer
+ * @returns {_Grouped[]}
+ */
+function groupEntities(tokens, ids, tokenizer) {
+    /** @type {{ tag: string, start: number, end: number }[]} */
+    const groups = []; // each entry is a [start, end) slice into `tokens`, plus the shared tag
+    let openTag = null; // null = no open group
+
+    for (let i = 0; i < tokens.length; ++i) {
+        const [prefix, tag] = getTag(tokens[i].entity);
+        const extend = openTag === tag && prefix !== 'B' && prefix !== 'S';
+        if (extend) {
+            groups[groups.length - 1].end = i + 1;
+            // `E` terminates the group; subsequent `I`/`E`/`S` start fresh.
+            if (prefix === 'E') openTag = null;
+        } else {
+            groups.push({ tag, start: i, end: i + 1 });
+            // `S` opens and immediately closes; anything else leaves the group open
+            // (including a leading `E` — best-effort recovery for a malformed sequence).
+            openTag = prefix === 'S' ? null : tag;
+        }
+    }
+
+    return groups.map(({ tag, start, end }) => {
+        let scoreSum = 0;
+        const groupIds = [];
+        for (let i = start; i < end; ++i) {
+            scoreSum += tokens[i].score;
+            groupIds.push(ids[tokens[i].index]);
+        }
+        return {
+            entity_group: tag,
+            score: scoreSum / (end - start),
+            word: tokenizer.decode(groupIds, { skip_special_tokens: true }),
+        };
+    });
+}