Fix NPE in TextReader source metadata

alexcheng1982 · markpollack · commit 05292ac730dc · 2024-10-03T17:11:05.000-04:00
This commit addresses the NPE issue in TextReader's source metadata handling. It introduces a new method, getResourceIdentifier(), to robustly extract identifiers from various Resource types. The fix ensures that: 1. The filename is used if available. 2. It falls back to the URI, then the URL, if the filename is not present. 3. It uses the resource description as a last resort. Additionally, the commit includes updated tests to verify the behavior with different Resource types, particularly ByteArrayResource. This change prevents NPEs when dealing with Resources that lack certain properties, improving the overall reliability of TextReader. Fixes spring-projects#1386
diff --git a/spring-ai-core/src/main/java/org/springframework/ai/reader/TextReader.java b/spring-ai-core/src/main/java/org/springframework/ai/reader/TextReader.java
@@ -16,6 +16,8 @@
 package org.springframework.ai.reader;
 
 import java.io.IOException;
+import java.net.URI;
+import java.net.URL;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
@@ -45,11 +47,11 @@ public class TextReader implements DocumentReader {
 	private final Resource resource;
 
 	/**
-	 * @return Character set to be used when loading data from the
+	 * Character set to be used when loading data from the resource.
 	 */
 	private Charset charset = StandardCharsets.UTF_8;
 
-	private Map<String, Object> customMetadata = new HashMap<>();
+	private final Map<String, Object> customMetadata = new HashMap<>();
 
 	public TextReader(String resourceUrl) {
 		this(new DefaultResourceLoader().getResource(resourceUrl));
@@ -86,6 +88,7 @@ public List<Document> get() {
 			// Inject source information as a metadata.
 			this.customMetadata.put(CHARSET_METADATA, this.charset.name());
 			this.customMetadata.put(SOURCE_METADATA, this.resource.getFilename());
+			this.customMetadata.put(SOURCE_METADATA, getResourceIdentifier(this.resource));
 
 			return List.of(new Document(document, this.customMetadata));
 
@@ -95,4 +98,37 @@ public List<Document> get() {
 		}
 	}
 
+	protected String getResourceIdentifier(Resource resource) {
+		// Resolve a source identifier for the resource; a non-null, non-empty filename is preferred
+		String filename = resource.getFilename();
+		if (filename != null && !filename.isEmpty()) {
+			return filename;
+		}
+
+		// No usable filename: fall back to the resource's URI, if one can be obtained
+		try {
+			URI uri = resource.getURI();
+			if (uri != null) {
+				return uri.toString();
+			}
+		}
+		catch (IOException ignored) {
+			// getURI() failed with an IOException; deliberately ignored so the URL is tried next
+		}
+
+		// No URI either: fall back to the resource's URL, if one can be obtained
+		try {
+			URL url = resource.getURL();
+			if (url != null) {
+				return url.toString();
+			}
+		}
+		catch (IOException ignored) {
+			// getURL() failed with an IOException; deliberately ignored so getDescription() is used
+		}
+
+		// Last resort: identify the resource by its description
+		return resource.getDescription();
+	}
+
 }
diff --git a/spring-ai-core/src/test/java/org/springframework/ai/reader/TextReaderTests.java b/spring-ai-core/src/test/java/org/springframework/ai/reader/TextReaderTests.java
@@ -15,26 +15,33 @@
  */
 package org.springframework.ai.reader;
 
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 
 import org.junit.jupiter.api.Test;
 
 import org.springframework.ai.document.Document;
 import org.springframework.ai.transformer.splitter.TokenTextSplitter;
+import org.springframework.core.io.ByteArrayResource;
 import org.springframework.core.io.DefaultResourceLoader;
+import org.springframework.core.io.FileSystemResource;
 import org.springframework.core.io.Resource;
 
 import static org.assertj.core.api.Assertions.assertThat;
 
 /**
  * @author Christian Tzolov
+ * @author Mark Pollack
  */
 public class TextReaderTests {
 
-	private Resource resource = new DefaultResourceLoader().getResource("classpath:text_source.txt");
-
 	@Test
 	void loadText() {
+		Resource resource = new DefaultResourceLoader().getResource("classpath:text_source.txt");
 		assertThat(resource).isNotNull();
 		TextReader textReader = new TextReader(resource);
 		textReader.getCustomMetadata().put("customKey", "Value");
@@ -53,4 +60,49 @@ void loadText() {
 		}
 	}
 
+	@Test
+	void loadTextFromByteArrayResource() {
+		// Case 1: ByteArrayResource built from the bytes only (no explicit description)
+		Resource defaultByteArrayResource = new ByteArrayResource("Test content".getBytes(StandardCharsets.UTF_8));
+		assertThat(defaultByteArrayResource).isNotNull();
+		TextReader defaultTextReader = new TextReader(defaultByteArrayResource);
+		defaultTextReader.getCustomMetadata().put("customKey", "DefaultValue");
+
+		List<Document> defaultDocuments = defaultTextReader.get();
+
+		assertThat(defaultDocuments).hasSize(1);
+
+		Document defaultDocument = defaultDocuments.get(0);
+		assertThat(defaultDocument.getMetadata()).containsEntry("customKey", "DefaultValue")
+			.containsEntry(TextReader.CHARSET_METADATA, "UTF-8");
+
+		// Source metadata should be the resource description (presumably the ByteArrayResource default text - confirm)
+		assertThat(defaultDocument.getMetadata().get(TextReader.SOURCE_METADATA))
+			.isEqualTo("Byte array resource [resource loaded from byte array]");
+
+		assertThat(defaultDocument.getContent()).isEqualTo("Test content");
+
+		// Case 2: ByteArrayResource built with an explicit description string
+		String customDescription = "Custom byte array resource";
+		Resource customByteArrayResource = new ByteArrayResource(
+				"Another test content".getBytes(StandardCharsets.UTF_8), customDescription);
+		assertThat(customByteArrayResource).isNotNull();
+		TextReader customTextReader = new TextReader(customByteArrayResource);
+		customTextReader.getCustomMetadata().put("customKey", "CustomValue");
+
+		List<Document> customDocuments = customTextReader.get();
+
+		assertThat(customDocuments).hasSize(1);
+
+		Document customDocument = customDocuments.get(0);
+		assertThat(customDocument.getMetadata()).containsEntry("customKey", "CustomValue")
+			.containsEntry(TextReader.CHARSET_METADATA, "UTF-8");
+
+		// Source metadata should contain the custom description wrapped as "Byte array resource [...]"
+		assertThat(customDocument.getMetadata().get(TextReader.SOURCE_METADATA))
+			.isEqualTo("Byte array resource [Custom byte array resource]");
+
+		assertThat(customDocument.getContent()).isEqualTo("Another test content");
+	}
+
 }