@@ -62,7 +62,7 @@ export interface SimulationError {
62
62
| 'non_additive_processor_failure' ;
63
63
}
64
64
/**
 * Overall outcome of simulating the processing pipeline against one document.
 *
 * - `parsed`: every processor result was either successful or skipped.
 * - `partially_parsed`: at least one processor succeeded, but not all of them.
 * - `skipped`: every processor result was skipped.
 * - `failed`: none of the above applied.
 */
export type DocSimulationStatus = 'parsed' | 'partially_parsed' | 'skipped' | 'failed';
66
66
67
67
export interface SimulationDocReport {
68
68
detected_fields : Array < { processor_id : string ; name : string } > ;
@@ -75,6 +75,7 @@ export interface ProcessorMetrics {
75
75
detected_fields : string [ ] ;
76
76
errors : SimulationError [ ] ;
77
77
failure_rate : number ;
78
+ skipped_rate : number ;
78
79
success_rate : number ;
79
80
}
80
81
@@ -113,7 +114,6 @@ export const simulateProcessing = async ({
113
114
/* 1. Prepare data for either simulation types (ingest, pipeline), prepare simulation body for the mandatory pipeline simulation */
114
115
const simulationData = prepareSimulationData ( params ) ;
115
116
const pipelineSimulationBody = preparePipelineSimulationBody ( simulationData ) ;
116
-
117
117
/**
118
118
* 2. Run both pipeline and ingest simulations in parallel.
119
119
* - The pipeline simulation is used to extract the documents reports and the processor metrics. This always runs.
@@ -188,7 +188,16 @@ const prepareSimulationProcessors = (
188
188
} as ProcessorDefinition ;
189
189
} ) ;
190
190
191
- return formatToIngestProcessors ( processors ) ;
191
+ const dotExpanderProcessor : Pick < IngestProcessorContainer , 'dot_expander' > = {
192
+ dot_expander : {
193
+ field : '*' ,
194
+ override : true ,
195
+ } ,
196
+ } ;
197
+
198
+ const formattedProcessors = formatToIngestProcessors ( processors ) ;
199
+
200
+ return [ dotExpanderProcessor , ...formattedProcessors ] ;
192
201
} ;
193
202
194
203
const prepareSimulationData = ( params : ProcessingSimulationParams ) => {
@@ -351,10 +360,18 @@ const computePipelineSimulationResult = (
351
360
const processorsMap = initProcessorMetricsMap ( processing ) ;
352
361
353
362
const docReports = simulationResult . docs . map ( ( docResult , id ) => {
354
- const { errors, status, value } = getLastDoc ( docResult ) ;
363
+ const { errors, status, value } = getLastDoc ( docResult , sampleDocs [ id ] . _source ) ;
355
364
356
365
const diff = computeSimulationDocDiff ( docResult , sampleDocs [ id ] . _source ) ;
357
366
367
+ docResult . processor_results . forEach ( ( processor ) => {
368
+ const procId = processor . tag ;
369
+
370
+ if ( procId && isSkippedProcessor ( processor ) ) {
371
+ processorsMap [ procId ] . skipped_rate ++ ;
372
+ }
373
+ } ) ;
374
+
358
375
diff . detected_fields . forEach ( ( { processor_id, name } ) => {
359
376
processorsMap [ processor_id ] . detected_fields . push ( name ) ;
360
377
} ) ;
@@ -392,6 +409,7 @@ const initProcessorMetricsMap = (
392
409
detected_fields : [ ] ,
393
410
errors : [ ] ,
394
411
failure_rate : 0 ,
412
+ skipped_rate : 0 ,
395
413
success_rate : 1 ,
396
414
} ,
397
415
] ) ;
@@ -408,30 +426,47 @@ const extractProcessorMetrics = ({
408
426
} ) => {
409
427
return mapValues ( processorsMap , ( metrics ) => {
410
428
const failureRate = metrics . failure_rate / sampleSize ;
411
- const successRate = 1 - failureRate ;
429
+ const skippedRate = metrics . skipped_rate / sampleSize ;
430
+ const successRate = 1 - skippedRate - failureRate ;
412
431
const detected_fields = uniq ( metrics . detected_fields ) ;
413
432
const errors = uniqBy ( metrics . errors , ( error ) => error . message ) ;
414
433
415
434
return {
416
435
detected_fields,
417
436
errors,
418
437
failure_rate : parseFloat ( failureRate . toFixed ( 2 ) ) ,
438
+ skipped_rate : parseFloat ( skippedRate . toFixed ( 2 ) ) ,
419
439
success_rate : parseFloat ( successRate . toFixed ( 2 ) ) ,
420
440
} ;
421
441
} ) ;
422
442
} ;
423
443
/**
 * Derives the overall simulation status of a document from its processor results.
 *
 * The first processor result is ignored because it belongs to the base dot expander
 * processor that is always present. The remaining results are classified in order:
 * all skipped -> 'skipped'; all successful or skipped -> 'parsed';
 * at least one successful -> 'partially_parsed'; otherwise 'failed'.
 *
 * @param doc The simulation result of a single document.
 * @returns The status computed for the document.
 */
const getDocumentStatus = (doc: SuccessfulIngestSimulateDocumentResult): DocSimulationStatus => {
  // Remove the always present base processor for dot expander
  const [, ...relevantResults] = doc.processor_results;

  const allSkipped = relevantResults.every((result) => isSkippedProcessor(result));
  if (allSkipped) {
    return 'skipped';
  }

  // "No result that is neither successful nor skipped" is the same as "every result is one of the two".
  const hasUnresolvedResult = relevantResults.some(
    (result) => !isSuccessfulProcessor(result) && !isSkippedProcessor(result)
  );
  if (!hasUnresolvedResult) {
    return 'parsed';
  }

  const hasSuccessfulResult = relevantResults.some((result) => isSuccessfulProcessor(result));
  return hasSuccessfulResult ? 'partially_parsed' : 'failed';
};
431
462
432
- const getLastDoc = ( docResult : SuccessfulIngestSimulateDocumentResult ) => {
463
+ const getLastDoc = ( docResult : SuccessfulIngestSimulateDocumentResult , sample : FlattenRecord ) => {
433
464
const status = getDocumentStatus ( docResult ) ;
434
- const lastDocSource = docResult . processor_results . at ( - 1 ) ?. doc ?. _source ?? { } ;
465
+ const lastDocSource =
466
+ docResult . processor_results
467
+ . slice ( 1 ) // Remove the always present base processor for dot expander
468
+ . filter ( ( proc ) => ! isSkippedProcessor ( proc ) )
469
+ . at ( - 1 ) ?. doc ?. _source ?? sample ;
435
470
436
471
if ( status === 'parsed' ) {
437
472
return {
@@ -440,7 +475,7 @@ const getLastDoc = (docResult: SuccessfulIngestSimulateDocumentResult) => {
440
475
status,
441
476
} ;
442
477
} else {
443
- const { _errors, ...value } = lastDocSource ;
478
+ const { _errors = [ ] , ...value } = lastDocSource ;
444
479
return { value : flattenObjectNestedLast ( value ) , errors : _errors as SimulationError [ ] , status } ;
445
480
}
446
481
} ;
@@ -459,7 +494,7 @@ const computeSimulationDocDiff = (
459
494
const successfulProcessors = docResult . processor_results . filter ( isSuccessfulProcessor ) ;
460
495
461
496
const comparisonDocs = [
462
- { processor_id : 'sample ' , value : sample } ,
497
+ { processor_id : 'base ' , value : docResult . processor_results [ 0 ] ! . doc ! . _source } ,
463
498
...successfulProcessors . map ( ( proc ) => ( {
464
499
processor_id : proc . tag ,
465
500
value : omit ( proc . doc . _source , [ '_errors' ] ) ,
@@ -495,7 +530,7 @@ const computeSimulationDocDiff = (
495
530
496
531
// We might have updated fields that are not present in the original document because are generated by the previous processors.
497
532
// We exclude them from the list of fields that make the processor non-additive.
498
- const originalUpdatedFields = updatedFields . filter ( ( field ) => field in sample ) ;
533
+ const originalUpdatedFields = updatedFields . filter ( ( field ) => field in sample ) . sort ( ) ;
499
534
if ( ! isEmpty ( originalUpdatedFields ) ) {
500
535
diffResult . errors . push ( {
501
536
processor_id : nextDoc . processor_id ,
@@ -514,7 +549,8 @@ const prepareSimulationResponse = async (
514
549
detectedFields : DetectedField [ ]
515
550
) => {
516
551
const successRate = computeSuccessRate ( docReports ) ;
517
- const failureRate = 1 - successRate ;
552
+ const skippedRate = computeSkippedRate ( docReports ) ;
553
+ const failureRate = 1 - skippedRate - successRate ;
518
554
const isNotAdditiveSimulation = some ( processorsMetrics , ( metrics ) =>
519
555
metrics . errors . some ( isNonAdditiveSimulationError )
520
556
) ;
@@ -524,6 +560,7 @@ const prepareSimulationResponse = async (
524
560
documents : docReports ,
525
561
processors_metrics : processorsMetrics ,
526
562
failure_rate : parseFloat ( failureRate . toFixed ( 2 ) ) ,
563
+ skipped_rate : parseFloat ( skippedRate . toFixed ( 2 ) ) ,
527
564
success_rate : parseFloat ( successRate . toFixed ( 2 ) ) ,
528
565
is_non_additive_simulation : isNotAdditiveSimulation ,
529
566
} ;
@@ -538,10 +575,12 @@ const prepareSimulationFailureResponse = (error: SimulationError) => {
538
575
detected_fields : [ ] ,
539
576
errors : [ error ] ,
540
577
failure_rate : 1 ,
578
+ skipped_rate : 0 ,
541
579
success_rate : 0 ,
542
580
} ,
543
581
} ,
544
582
failure_rate : 1 ,
583
+ skipped_rate : 0 ,
545
584
success_rate : 0 ,
546
585
is_non_additive_simulation : isNonAdditiveSimulationError ( error ) ,
547
586
} ;
@@ -597,6 +636,12 @@ const computeSuccessRate = (docs: SimulationDocReport[]) => {
597
636
return successfulCount / docs . length ;
598
637
} ;
599
638
/**
 * Computes the fraction of document reports whose status is 'skipped'.
 *
 * @param docs The document reports produced by the simulation.
 * @returns The skipped count divided by the number of reports, or 0 when `docs` is
 * empty (avoiding the NaN that a 0 / 0 division would produce).
 */
const computeSkippedRate = (docs: SimulationDocReport[]) => {
  if (docs.length === 0) {
    return 0;
  }

  const skippedCount = docs.filter((doc) => doc.status === 'skipped').length;

  return skippedCount / docs.length;
};
644
+
/**
 * Builds an object keyed by field name from the detected fields,
 * where each value holds only the field's `type`.
 *
 * @param detectedFields The detected fields, each providing a `name` and a `type`.
 * @returns An object mapping every field name to `{ type }`.
 */
const computeMappingProperties = (detectedFields: NamedFieldDefinitionConfig[]) => {
  return Object.fromEntries(
    detectedFields.map((field) => {
      return [field.name, { type: field.type }];
    })
  );
};
@@ -609,6 +654,12 @@ const isSuccessfulProcessor = (
609
654
) : processor is WithRequired < IngestSimulatePipelineSimulation , 'doc' | 'tag' > =>
610
655
processor . status === 'success' && ! ! processor . tag ;
611
656
/**
 * Tells whether a processor simulation result reports the 'skipped' status.
 *
 * NOTE: only `status` is checked here. The predicate narrows to a result with a
 * required `tag`, but the presence of `tag` is not verified by this function, so
 * callers that rely on `tag` should check it themselves.
 *
 * @param processor A single processor result from the pipeline simulation.
 * @returns `true` when the processor status is 'skipped'.
 */
const isSkippedProcessor = (
  processor: IngestSimulatePipelineSimulation
): processor is WithRequired<IngestSimulatePipelineSimulation, 'tag'> =>
  // @ts-expect-error Looks like the IngestSimulatePipelineSimulation.status is not typed correctly and misses the 'skipped' status
  processor.status === 'skipped';
662
+
// TODO: update type once Kibana updates to elasticsearch-js 8.17
/**
 * Tells whether an entry carries a document error of type 'document_parsing_exception'.
 *
 * @param entry An entry that may expose `doc.error.type`; missing levels are tolerated.
 * @returns `true` only when `entry.doc.error.type` equals 'document_parsing_exception'.
 */
const isMappingFailure = (entry: any) => {
  const errorType = entry.doc?.error?.type;

  return errorType === 'document_parsing_exception';
};
614
665
0 commit comments