Skip to content

Commit 2428892

Browse files
committed
Use ES low-level API for bulk indexing. Workaround for gchq#2783
1 parent 99d6590 commit 2428892

File tree

1 file changed

+38
-49
lines changed

1 file changed

+38
-49
lines changed

stroom-search/stroom-search-elastic/src/main/java/stroom/search/elastic/indexing/ElasticIndexingFilter.java

Lines changed: 38 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -43,13 +43,12 @@
4343

4444
import com.fasterxml.jackson.core.JsonFactory;
4545
import com.fasterxml.jackson.core.JsonGenerator;
46-
import org.elasticsearch.action.DocWriteRequest.OpType;
47-
import org.elasticsearch.action.bulk.BulkRequest;
48-
import org.elasticsearch.action.bulk.BulkResponse;
49-
import org.elasticsearch.action.index.IndexRequest;
46+
import com.fasterxml.jackson.databind.ObjectMapper;
47+
import org.apache.http.StatusLine;
5048
import org.elasticsearch.action.search.SearchRequest;
5149
import org.elasticsearch.action.search.SearchResponse;
5250
import org.elasticsearch.action.support.WriteRequest.RefreshPolicy;
51+
import org.elasticsearch.client.Request;
5352
import org.elasticsearch.client.RequestOptions;
5453
import org.elasticsearch.client.RestHighLevelClient;
5554
import org.elasticsearch.index.query.TermQueryBuilder;
@@ -62,17 +61,19 @@
6261
import org.elasticsearch.search.aggregations.bucket.composite.CompositeValuesSourceBuilder;
6362
import org.elasticsearch.search.aggregations.bucket.composite.TermsValuesSourceBuilder;
6463
import org.elasticsearch.search.builder.SearchSourceBuilder;
65-
import org.elasticsearch.xcontent.XContentType;
6664
import org.xml.sax.Attributes;
6765
import org.xml.sax.Locator;
6866
import org.xml.sax.SAXException;
6967

7068
import java.io.ByteArrayOutputStream;
7169
import java.io.IOException;
7270
import java.security.InvalidParameterException;
71+
import java.time.Duration;
72+
import java.time.Instant;
7373
import java.time.ZonedDateTime;
7474
import java.time.format.DateTimeFormatter;
7575
import java.util.ArrayList;
76+
import java.util.HashMap;
7677
import java.util.List;
7778
import java.util.Map;
7879
import javax.inject.Inject;
@@ -108,8 +109,9 @@ class ElasticIndexingFilter extends AbstractXMLFilter {
108109
private String indexNameDateFormat;
109110
private String indexNameDateFieldName = "@timestamp";
110111

111-
private final List<IndexRequest> indexRequests;
112-
private final ByteArrayOutputStream currentDocument;
112+
private final List<String> indexRequests = new ArrayList<>();
113+
private long indexRequestsDocCount = 0;
114+
private final ByteArrayOutputStream currentDocument = new ByteArrayOutputStream(INITIAL_JSON_STREAM_SIZE_BYTES);
113115
private final StringBuilder valueBuffer = new StringBuilder();
114116
private String currentDocFieldName = null;
115117
private int currentDocPropertyCount = 0;
@@ -136,9 +138,6 @@ class ElasticIndexingFilter extends AbstractXMLFilter {
136138
this.elasticClusterStore = elasticClusterStore;
137139
this.streamProcessorHolder = streamProcessorHolder;
138140
this.metaHolder = metaHolder;
139-
140-
indexRequests = new ArrayList<>();
141-
currentDocument = new ByteArrayOutputStream(INITIAL_JSON_STREAM_SIZE_BYTES);
142141
}
143142

144143
/**
@@ -392,28 +391,25 @@ private void processDocument() {
392391
if (currentDocPropertyCount > 0) {
393392
jsonGenerator.flush();
394393

395-
final IndexRequest indexRequest = new IndexRequest(formatIndexName())
396-
.opType(OpType.CREATE)
397-
.source(currentDocument.toByteArray(), XContentType.JSON);
398-
399-
// If an ingest pipeline name is specified, execute it when ingesting the document
400-
if (ingestPipelineName != null && !ingestPipelineName.isEmpty()) {
401-
indexRequest.setPipeline(ingestPipelineName);
402-
}
403-
404-
indexRequests.add(indexRequest);
394+
final HashMap<String, Object> indexMap = new HashMap<>();
395+
indexMap.put("_index", formatIndexName());
396+
final HashMap<String, Object> createMap = new HashMap<>();
397+
createMap.put("create", indexMap);
398+
indexRequests.add(new ObjectMapper().writeValueAsString(createMap));
399+
indexRequests.add(currentDocument.toString());
400+
indexRequestsDocCount++;
405401
}
406402

407-
if (indexRequests.size() >= batchSize) {
403+
if (indexRequestsDocCount >= batchSize) {
408404
indexDocuments();
409405
}
410406
} catch (IOException e) {
411407
fatalError("Failed to flush JSON to stream", e);
412408
} catch (Exception e) {
413409
fatalError(e.getMessage(), e);
410+
} finally {
411+
clearDocument();
414412
}
415-
416-
clearDocument();
417413
}
418414

419415
private void clearDocument() {
@@ -444,7 +440,6 @@ private boolean purgeDocumentsForStream(final RestHighLevelClient elasticClient,
444440
final BulkByScrollResponse deleteResponse = elasticClient.deleteByQuery(deleteRequest,
445441
RequestOptions.DEFAULT);
446442
final long deletedCount = deleteResponse.getDeleted();
447-
448443
LOGGER.info("Deleted {} documents matching StreamId: {} from index: {}, took {} seconds",
449444
deletedCount, streamId, indexName, deleteResponse.getTook().getSecondsFrac());
450445
}
@@ -508,49 +503,43 @@ private List<String> getTargetIndexNames(final RestHighLevelClient elasticClient
508503
* Index the current batch of documents
509504
*/
510505
private void indexDocuments() {
511-
if (indexRequests.size() > 0) {
506+
if (indexRequestsDocCount > 0) {
512507
final ElasticClusterDoc elasticCluster = elasticClusterStore.readDocument(clusterRef);
513508

514509
elasticClientCache.context(elasticCluster.getConnection(), elasticClient -> {
515510
try {
516511
// Create a new bulk indexing request, containing the current batch of documents
517-
final BulkRequest bulkRequest = new BulkRequest();
512+
final Request request = new Request("POST", "/_bulk");
518513

519514
// Join the buffered action and document lines into a single newline-delimited request body
520-
for (IndexRequest indexRequest : indexRequests) {
521-
bulkRequest.add(indexRequest);
515+
final String requestBody = String.join(System.lineSeparator(), indexRequests) +
516+
System.lineSeparator();
517+
request.setJsonEntity(requestBody);
518+
519+
// If an ingest pipeline name is specified, execute it when ingesting the document
520+
if (ingestPipelineName != null && !ingestPipelineName.isEmpty()) {
521+
request.addParameter("pipeline", ingestPipelineName);
522522
}
523523

524524
if (refreshAfterEachBatch) {
525525
// Refresh upon completion of the batch index request
526-
bulkRequest.setRefreshPolicy(RefreshPolicy.IMMEDIATE);
526+
request.addParameter("refresh", RefreshPolicy.IMMEDIATE.getValue());
527527
} else {
528528
// Only refresh after all batches have been indexed
529-
bulkRequest.setRefreshPolicy(RefreshPolicy.NONE);
529+
request.addParameter("refresh", RefreshPolicy.NONE.getValue());
530530
}
531531

532-
final BulkResponse response = elasticClient.bulk(bulkRequest, RequestOptions.DEFAULT);
533-
if (response.hasFailures()) {
534-
throw new IOException("Bulk index request failed: " + response.buildFailureMessage());
535-
} else {
536-
LOGGER.info("Indexed {} documents to Elasticsearch cluster: {}, took {} seconds",
537-
indexRequests.size(), elasticCluster.getName(), response.getTook().getSecondsFrac());
538-
}
539-
} catch (final RuntimeException e) {
532+
request.setOptions(RequestOptions.DEFAULT);
533+
final Instant start = Instant.now();
534+
elasticClient.getLowLevelClient().performRequest(request);
535+
final float tookSeconds = Duration.between(start, Instant.now()).toMillis() / 1000.0f;
536+
LOGGER.info("Indexed {} documents to Elasticsearch cluster: {}, took {} seconds",
537+
indexRequestsDocCount, elasticCluster.getName(), tookSeconds);
538+
} catch (final RuntimeException | IOException e) {
540539
fatalError(e.getMessage(), e);
541-
} catch (IOException e) {
542-
final String message = e.getMessage();
543-
// Elasticsearch v8.0.0 breaks the Java High Level REST Client `bulk` API, by sending back a
544-
// response that the API cannot handle, causing an exception.
545-
// This is a workaround, where we inspect the actual HTTP return code and if it's `200`, take
546-
// the request to have succeeded.
547-
// TODO: Review this once a compatible Elasticsearch Java client is released
548-
// @see https://github.com/elastic/elasticsearch/issues/84173
549-
if (message == null || !message.matches("^.+ response=HTTP/1\\.1 200 OK}$")) {
550-
fatalError(message, e);
551-
}
552540
} finally {
553541
indexRequests.clear();
542+
indexRequestsDocCount = 0;
554543
}
555544
});
556545
}

0 commit comments

Comments
 (0)