diff --git a/dat-core/src/main/java/ai/dat/core/contentstore/DefaultContentStore.java b/dat-core/src/main/java/ai/dat/core/contentstore/DefaultContentStore.java index b0af1cc..a6b601e 100644 --- a/dat-core/src/main/java/ai/dat/core/contentstore/DefaultContentStore.java +++ b/dat-core/src/main/java/ai/dat/core/contentstore/DefaultContentStore.java @@ -1,9 +1,6 @@ package ai.dat.core.contentstore; -import ai.dat.core.contentstore.data.BusinessKnowledgeIndexingMethod; -import ai.dat.core.contentstore.data.QuestionSqlPair; -import ai.dat.core.contentstore.data.SemanticModelIndexingMethod; -import ai.dat.core.contentstore.data.WordSynonymPair; +import ai.dat.core.contentstore.data.*; import ai.dat.core.contentstore.utils.ContentStoreUtil; import ai.dat.core.semantic.data.SemanticModel; import ai.dat.core.semantic.view.ElementView; @@ -99,6 +96,12 @@ public class DefaultContentStore implements ContentStore { private final Integer docGCEMaxChunkOverlap; private final String docGCEChunkRegex; + private final BusinessKnowledgeIndexingParentMode docPCCEParentMode; + private final Integer docPCCEParentMaxChunkSize; + private final String docPCCEParentChunkRegex; + private final Integer docPCCEChildMaxChunkSize; + private final String docPCCEChildChunkRegex; + private final Integer docMaxResults; private final Double docMinScore; // ------------------------------------------------------------------------------------------------------------- @@ -122,6 +125,9 @@ public DefaultContentStore(@NonNull EmbeddingModel embeddingModel, BusinessKnowledgeIndexingMethod docIndexingMethod, Integer docGCEMaxChunkSize, Integer docGCEMaxChunkOverlap, String docGCEChunkRegex, + BusinessKnowledgeIndexingParentMode docPCCEParentMode, + Integer docPCCEParentMaxChunkSize, String docPCCEParentChunkRegex, + Integer docPCCEChildMaxChunkSize, String docPCCEChildChunkRegex, Integer docMaxResults, Double docMinScore) { this.defaultChatModel = defaultChatModel; this.embeddingModel = embeddingModel; @@ -168,7 +174,7 @@ public DefaultContentStore(@NonNull EmbeddingModel embeddingModel, this.docIndexingMethod = Optional.ofNullable(docIndexingMethod) .orElse(BusinessKnowledgeIndexingMethod.FE); - this.docGCEMaxChunkSize = Optional.ofNullable(docGCEMaxChunkSize).orElse(4096); + this.docGCEMaxChunkSize = Optional.ofNullable(docGCEMaxChunkSize).orElse(512); Preconditions.checkArgument(this.docGCEMaxChunkSize > 0, "docGCEMaxChunkSize must be greater than 0"); this.docGCEMaxChunkOverlap = Optional.ofNullable(docGCEMaxChunkOverlap).orElse(0); @@ -178,6 +184,19 @@ public DefaultContentStore(@NonNull EmbeddingModel embeddingModel, "docGCEMaxChunkOverlap value must be less than docGCEMaxChunkSize value"); this.docGCEChunkRegex = docGCEChunkRegex; + this.docPCCEParentMode = Optional.ofNullable(docPCCEParentMode) + .orElse(BusinessKnowledgeIndexingParentMode.FULLTEXT); + this.docPCCEParentMaxChunkSize = Optional.ofNullable(docPCCEParentMaxChunkSize).orElse(1024); + Preconditions.checkArgument(this.docPCCEParentMaxChunkSize > 0, + "docPCCEParentMaxChunkSize must be greater than 0"); + this.docPCCEChildMaxChunkSize = Optional.ofNullable(docPCCEChildMaxChunkSize).orElse(512); + Preconditions.checkArgument(this.docPCCEChildMaxChunkSize > 0, + "docPCCEChildMaxChunkSize must be greater than 0"); + Preconditions.checkArgument(this.docPCCEParentMaxChunkSize > this.docPCCEChildMaxChunkSize, + "docPCCEChildMaxChunkSize value must be less than docPCCEParentMaxChunkSize value"); + this.docPCCEParentChunkRegex = docPCCEParentChunkRegex; + this.docPCCEChildChunkRegex = docPCCEChildChunkRegex; + this.docMaxResults = Optional.ofNullable(docMaxResults).orElse(this.maxResults); Preconditions.checkArgument(this.docMaxResults <= 100 && this.docMaxResults >= 1, "docMaxResults must be between 1 and 100"); @@ -198,14 +217,13 @@ public List addMdls(List semanticModels) { } private List addMdlsForHyQE(List semanticModels) { - return semanticModels.stream() - .map(semanticModel -> { + return semanticModels.stream().flatMap(semanticModel -> { SemanticModelUtil.validateSemanticModel(semanticModel); String semanticModelViewText = SemanticModelUtil.toSemanticModelViewText(semanticModel); List questions = mdlHyQEAssistant.genHypotheticalQuestions( mdlHyQEInstruction, mdlHyQEQuestions, semanticModelViewText); if (questions == null || questions.isEmpty()) { - return null; + return Stream.empty(); } String json; try { @@ -217,13 +235,9 @@ private List addMdlsForHyQE(List semanticModels) { TextSegment textSegment = TextSegment.from(json, MDL_METADATA); List embedTextSegments = questions.stream().map(TextSegment::from).toList(); List embeddings = embeddingModel.embedAll(embedTextSegments).content(); - List textSegments = questions.stream() - .map(question -> textSegment) - .collect(Collectors.toList()); - return mdlEmbeddingStore.addAll(embeddings, textSegments); + List textSegments = embeddings.stream().map(o -> textSegment).collect(Collectors.toList()); + return mdlEmbeddingStore.addAll(embeddings, textSegments).stream(); }) - .filter(Objects::nonNull) - .flatMap(Collection::stream) .collect(Collectors.toList()); } @@ -236,8 +250,7 @@ List genHypotheticalQuestions(@V("instruction") String instruction, } private List addMdlsForCE(List semanticModels) { - return semanticModels.stream() - .map(semanticModel -> { + return semanticModels.stream().flatMap(semanticModel -> { SemanticModelUtil.validateSemanticModel(semanticModel); SemanticModelView semanticModelView = SemanticModelUtil.toSemanticModelView(semanticModel); List columnTexts = Stream.of( @@ -265,13 +278,9 @@ private List addMdlsForCE(List semanticModels) { TextSegment textSegment = TextSegment.from(json, MDL_METADATA); List embedTextSegments = columnTexts.stream().map(TextSegment::from).toList(); List embeddings = embeddingModel.embedAll(embedTextSegments).content(); - List textSegments = columnTexts.stream() - .map(question -> textSegment) - .collect(Collectors.toList()); - return mdlEmbeddingStore.addAll(embeddings, textSegments); + List textSegments = embeddings.stream().map(o -> textSegment).collect(Collectors.toList()); + return mdlEmbeddingStore.addAll(embeddings, textSegments).stream(); }) - .filter(Objects::nonNull) - .flatMap(Collection::stream) .collect(Collectors.toList()); } @@ -503,10 +512,42 @@ public void removeAllSyns() { public List addDocs(List docs) { if (BusinessKnowledgeIndexingMethod.GCE == docIndexingMethod) { return addDocsForGCE(docs); + } else if (BusinessKnowledgeIndexingMethod.PCCE == docIndexingMethod) { + return addDocsForPCCE(docs); } return addDocsForFE(docs); } + private List addDocsForPCCE(List docs) { + DocumentSplitter parentSplitter = null; + if (BusinessKnowledgeIndexingParentMode.PARAGRAPH == docPCCEParentMode) { + parentSplitter = DocumentSplitters.recursive(docPCCEParentMaxChunkSize, 0); + if (docPCCEParentChunkRegex != null) { + parentSplitter = new DocumentByRegexSplitter(docPCCEParentChunkRegex, "\n\n", + docPCCEParentMaxChunkSize, 0, parentSplitter); + } + } + DocumentSplitter childSplitter = DocumentSplitters.recursive(docPCCEChildMaxChunkSize, 0); + if (docPCCEChildChunkRegex != null) { + childSplitter = new DocumentByRegexSplitter(docPCCEChildChunkRegex, "\n", + docPCCEChildMaxChunkSize, 0, childSplitter); + } + DocumentSplitter finalParentSplitter = parentSplitter; + DocumentSplitter finalChildSplitter = childSplitter; + return docs.stream().flatMap(text -> { + List parentTexts = BusinessKnowledgeIndexingParentMode.PARAGRAPH == docPCCEParentMode ? + finalParentSplitter.split(Document.document(text)).stream().map(TextSegment::text).toList() : + Collections.singletonList(text); + return parentTexts.stream().flatMap(parentText -> { + TextSegment textSegment = TextSegment.from(parentText, DOC_METADATA); + List embedTextSegments = finalChildSplitter.split(Document.document(parentText)); + List embeddings = embeddingModel.embedAll(embedTextSegments).content(); + List textSegments = embeddings.stream().map(o -> textSegment).collect(Collectors.toList()); + return docEmbeddingStore.addAll(embeddings, textSegments).stream(); + }); + }).collect(Collectors.toList()); + } + private List addDocsForGCE(List docs) { DocumentSplitter splitter = DocumentSplitters.recursive(docGCEMaxChunkSize, docGCEMaxChunkOverlap); if (docGCEChunkRegex != null) { diff --git a/dat-core/src/main/java/ai/dat/core/contentstore/data/BusinessKnowledgeIndexingMethod.java b/dat-core/src/main/java/ai/dat/core/contentstore/data/BusinessKnowledgeIndexingMethod.java index ba04431..81dbe55 100644 --- a/dat-core/src/main/java/ai/dat/core/contentstore/data/BusinessKnowledgeIndexingMethod.java +++ b/dat-core/src/main/java/ai/dat/core/contentstore/data/BusinessKnowledgeIndexingMethod.java @@ -9,7 +9,8 @@ @Getter public enum BusinessKnowledgeIndexingMethod { FE("Full Embeddings"), - GCE("General Chunking Embeddings"); + GCE("General Chunking Embeddings"), + PCCE("Parent-child Chunking Embeddings"); private final String description; diff --git a/dat-core/src/main/java/ai/dat/core/contentstore/data/BusinessKnowledgeIndexingParentMode.java b/dat-core/src/main/java/ai/dat/core/contentstore/data/BusinessKnowledgeIndexingParentMode.java new file mode 100644 index 0000000..f45fcce --- /dev/null +++ b/dat-core/src/main/java/ai/dat/core/contentstore/data/BusinessKnowledgeIndexingParentMode.java @@ -0,0 +1,20 @@ +package ai.dat.core.contentstore.data; + +import lombok.Getter; + +/** + * @Author JunjieM + * @Date 2025/10/23 + */ +@Getter +public enum BusinessKnowledgeIndexingParentMode { + FULLTEXT("The entire text is used as the parent chunk and retrieved directly."), + PARAGRAPH("This mode splits the text in to paragraphs based on regular expression " + + "and the maximum chunk length, using the split text as the parent chunk for retrieval."); + + private final String description; + + BusinessKnowledgeIndexingParentMode(String description) { + this.description = description; + } +} diff --git a/dat-core/src/main/java/ai/dat/core/factories/DefaultContentStoreFactory.java b/dat-core/src/main/java/ai/dat/core/factories/DefaultContentStoreFactory.java index aab53bf..20e70f3 100644 --- a/dat-core/src/main/java/ai/dat/core/factories/DefaultContentStoreFactory.java +++ b/dat-core/src/main/java/ai/dat/core/factories/DefaultContentStoreFactory.java @@ -6,6 +6,7 @@ import ai.dat.core.contentstore.ContentStore; import ai.dat.core.contentstore.DefaultContentStore; import ai.dat.core.contentstore.data.BusinessKnowledgeIndexingMethod; +import ai.dat.core.contentstore.data.BusinessKnowledgeIndexingParentMode; import ai.dat.core.contentstore.data.SemanticModelIndexingMethod; import ai.dat.core.factories.data.ChatModelInstance; import ai.dat.core.utils.FactoryUtil; @@ -124,7 +125,7 @@ public class DefaultContentStoreFactory implements ContentStoreFactory { public static final ConfigOption BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE = ConfigOptions.key("business-knowledge.indexing.gce-max-chunk-size") .intType() - .defaultValue(4096) + .defaultValue(512) .withDescription("Business knowledge `GCE` indexing method maximum chunk length."); public static final ConfigOption BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP = @@ -140,6 +141,41 @@ public class DefaultContentStoreFactory implements ContentStoreFactory { .withDescription("Business knowledge `GCE` indexing method split chunk regular expression. " + "When it is empty, use the default built-in recursive split method."); + public static final ConfigOption BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MODE = + ConfigOptions.key("business-knowledge.indexing.pcce-parent-mode") + .enumType(BusinessKnowledgeIndexingParentMode.class) + .defaultValue(BusinessKnowledgeIndexingParentMode.FULLTEXT) + .withDescription("Business knowledge `PCCE` indexing method parent chunk mode.\n" + + Arrays.stream(BusinessKnowledgeIndexingParentMode.values()) + .map(e -> e.name() + ": " + e.getDescription()) + .collect(Collectors.joining("\n"))); + + public static final ConfigOption BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE = + ConfigOptions.key("business-knowledge.indexing.pcce-parent-max-chunk-size") + .intType() + .defaultValue(1024) + .withDescription("Business knowledge `PCCE` indexing method parent maximum chunk length."); + + public static final ConfigOption BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_CHUNK_REGEX = + ConfigOptions.key("business-knowledge.indexing.pcce-parent-chunk-regex") + .stringType() + .noDefaultValue() + .withDescription("Business knowledge `PCCE` indexing method split parent chunk regular expression. " + + "When it is empty, use the default built-in recursive split method."); + + public static final ConfigOption BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE = + ConfigOptions.key("business-knowledge.indexing.pcce-child-max-chunk-size") + .intType() + .defaultValue(512) + .withDescription("Business knowledge `PCCE` indexing method child maximum chunk length."); + + public static final ConfigOption BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_CHUNK_REGEX = + ConfigOptions.key("business-knowledge.indexing.pcce-child-chunk-regex") + .stringType() + .noDefaultValue() + .withDescription("Business knowledge `PCCE` indexing method split child chunk regular expression. " + + "When it is empty, use the default built-in recursive split method."); + public static final ConfigOption BUSINESS_KNOWLEDGE_RETRIEVAL_MAX_RESULTS = ConfigOptions.key("business-knowledge.retrieval.max-results") .intType() @@ -254,6 +290,11 @@ public Set> optionalOptions() { BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE, BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP, BUSINESS_KNOWLEDGE_INDEXING_GCE_CHUNK_REGEX, + BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MODE, + BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE, + BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_CHUNK_REGEX, + BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE, + BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_CHUNK_REGEX, BUSINESS_KNOWLEDGE_RETRIEVAL_MAX_RESULTS, BUSINESS_KNOWLEDGE_RETRIEVAL_MIN_SCORE )); @@ -271,6 +312,11 @@ public Set> fingerprintOptions() { BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE, BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP, BUSINESS_KNOWLEDGE_INDEXING_GCE_CHUNK_REGEX, + BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MODE, + BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE, + BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_CHUNK_REGEX, + BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE, + BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_CHUNK_REGEX, // ------------------ Deprecated ------------------- SEMANTIC_MODEL_RETRIEVAL_STRATEGY, @@ -401,6 +447,12 @@ public ContentStore create(@NonNull ReadableConfig config, config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE).ifPresent(builder::docGCEMaxChunkSize); config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP).ifPresent(builder::docGCEMaxChunkOverlap); config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_GCE_CHUNK_REGEX).ifPresent(builder::docGCEChunkRegex); + } else if (BusinessKnowledgeIndexingMethod.PCCE == businessKnowledgeIndexingMethod) { + config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MODE).ifPresent(builder::docPCCEParentMode); + config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE).ifPresent(builder::docPCCEParentMaxChunkSize); + config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_CHUNK_REGEX).ifPresent(builder::docPCCEParentChunkRegex); + config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE).ifPresent(builder::docPCCEChildMaxChunkSize); + config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_CHUNK_REGEX).ifPresent(builder::docPCCEChildChunkRegex); } config.getOptional(BUSINESS_KNOWLEDGE_RETRIEVAL_MAX_RESULTS).ifPresent(builder::docMaxResults); config.getOptional(BUSINESS_KNOWLEDGE_RETRIEVAL_MIN_SCORE).ifPresent(builder::docMinScore); @@ -451,10 +503,19 @@ private void validateConfigOptions(ReadableConfig config, Preconditions.checkArgument(businessKnowledgeGCEMaxChunkSize > 0, "'" + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE.key() + "' value must be greater than 0"); Preconditions.checkArgument(businessKnowledgeGCEMaxChunkOverlap >= 0, - "'" + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP.key() + "' value must be greater than than or equal to 0"); + "'" + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP.key() + "' value must be greater than or equal to 0"); Preconditions.checkArgument(businessKnowledgeGCEMaxChunkSize > businessKnowledgeGCEMaxChunkOverlap, - "'" + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE.key() + "' value must be less than '" - + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP.key() + "' value"); + "'" + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP.key() + "' value must be less than '" + + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE.key() + "' value"); + Integer businessKnowledgePCCEParentMaxChunkSize = config.get(BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE); + Integer businessKnowledgePCCEChildMaxChunkSize = config.get(BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE); + Preconditions.checkArgument(businessKnowledgePCCEParentMaxChunkSize > 0, + "'" + BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE.key() + "' value must be greater than 0"); + Preconditions.checkArgument(businessKnowledgePCCEChildMaxChunkSize > 0, + "'" + BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE.key() + "' value must be greater than 0"); + Preconditions.checkArgument(businessKnowledgePCCEParentMaxChunkSize > businessKnowledgePCCEChildMaxChunkSize, + "'" + BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE.key() + "' value must be less than '" + + BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE.key() + "' value"); config.getOptional(BUSINESS_KNOWLEDGE_RETRIEVAL_MAX_RESULTS) .ifPresent(n -> Preconditions.checkArgument(n >= 1 && n <= 100, "'" + BUSINESS_KNOWLEDGE_RETRIEVAL_MAX_RESULTS.key() + "' value must be between 1 and 100"));