Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
package ai.dat.core.contentstore;

import ai.dat.core.contentstore.data.BusinessKnowledgeIndexingMethod;
import ai.dat.core.contentstore.data.QuestionSqlPair;
import ai.dat.core.contentstore.data.SemanticModelIndexingMethod;
import ai.dat.core.contentstore.data.WordSynonymPair;
import ai.dat.core.contentstore.data.*;
import ai.dat.core.contentstore.utils.ContentStoreUtil;
import ai.dat.core.semantic.data.SemanticModel;
import ai.dat.core.semantic.view.ElementView;
Expand Down Expand Up @@ -99,6 +96,12 @@ public class DefaultContentStore implements ContentStore {
private final Integer docGCEMaxChunkOverlap;
private final String docGCEChunkRegex;

private final BusinessKnowledgeIndexingParentMode docPCCEParentMode;
private final Integer docPCCEParentMaxChunkSize;
private final String docPCCEParentChunkRegex;
private final Integer docPCCEChildMaxChunkSize;
private final String docPCCEChildChunkRegex;

private final Integer docMaxResults;
private final Double docMinScore;
// -------------------------------------------------------------------------------------------------------------
Expand All @@ -122,6 +125,9 @@ public DefaultContentStore(@NonNull EmbeddingModel embeddingModel,
BusinessKnowledgeIndexingMethod docIndexingMethod,
Integer docGCEMaxChunkSize, Integer docGCEMaxChunkOverlap,
String docGCEChunkRegex,
BusinessKnowledgeIndexingParentMode docPCCEParentMode,
Integer docPCCEParentMaxChunkSize, String docPCCEParentChunkRegex,
Integer docPCCEChildMaxChunkSize, String docPCCEChildChunkRegex,
Integer docMaxResults, Double docMinScore) {
this.defaultChatModel = defaultChatModel;
this.embeddingModel = embeddingModel;
Expand Down Expand Up @@ -168,7 +174,7 @@ public DefaultContentStore(@NonNull EmbeddingModel embeddingModel,
this.docIndexingMethod = Optional.ofNullable(docIndexingMethod)
.orElse(BusinessKnowledgeIndexingMethod.FE);

this.docGCEMaxChunkSize = Optional.ofNullable(docGCEMaxChunkSize).orElse(4096);
this.docGCEMaxChunkSize = Optional.ofNullable(docGCEMaxChunkSize).orElse(512);
Preconditions.checkArgument(this.docGCEMaxChunkSize > 0,
"docGCEMaxChunkSize must be greater than 0");
this.docGCEMaxChunkOverlap = Optional.ofNullable(docGCEMaxChunkOverlap).orElse(0);
Expand All @@ -178,6 +184,19 @@ public DefaultContentStore(@NonNull EmbeddingModel embeddingModel,
"docGCEMaxChunkOverlap value must be less than docGCEMaxChunkSize value");
this.docGCEChunkRegex = docGCEChunkRegex;

this.docPCCEParentMode = Optional.ofNullable(docPCCEParentMode)
.orElse(BusinessKnowledgeIndexingParentMode.FULLTEXT);
this.docPCCEParentMaxChunkSize = Optional.ofNullable(docPCCEParentMaxChunkSize).orElse(1024);
Preconditions.checkArgument(this.docPCCEParentMaxChunkSize > 0,
"docPCCEParentMaxChunkSize must be greater than 0");
this.docPCCEChildMaxChunkSize = Optional.ofNullable(docPCCEChildMaxChunkSize).orElse(512);
Preconditions.checkArgument(this.docPCCEChildMaxChunkSize > 0,
"docPCCEChildMaxChunkSize must be greater than 0");
Preconditions.checkArgument(this.docPCCEParentMaxChunkSize > this.docPCCEChildMaxChunkSize,
"docPCCEChildMaxChunkSize value must be less than docPCCEParentMaxChunkSize value");
this.docPCCEParentChunkRegex = docPCCEParentChunkRegex;
this.docPCCEChildChunkRegex = docPCCEChildChunkRegex;

this.docMaxResults = Optional.ofNullable(docMaxResults).orElse(this.maxResults);
Preconditions.checkArgument(this.docMaxResults <= 100 && this.docMaxResults >= 1,
"docMaxResults must be between 1 and 100");
Expand All @@ -198,14 +217,13 @@ public List<String> addMdls(List<SemanticModel> semanticModels) {
}

private List<String> addMdlsForHyQE(List<SemanticModel> semanticModels) {
return semanticModels.stream()
.map(semanticModel -> {
return semanticModels.stream().flatMap(semanticModel -> {
SemanticModelUtil.validateSemanticModel(semanticModel);
String semanticModelViewText = SemanticModelUtil.toSemanticModelViewText(semanticModel);
List<String> questions = mdlHyQEAssistant.genHypotheticalQuestions(
mdlHyQEInstruction, mdlHyQEQuestions, semanticModelViewText);
if (questions == null || questions.isEmpty()) {
return null;
return Stream.empty();
}
String json;
try {
Expand All @@ -217,13 +235,9 @@ private List<String> addMdlsForHyQE(List<SemanticModel> semanticModels) {
TextSegment textSegment = TextSegment.from(json, MDL_METADATA);
List<TextSegment> embedTextSegments = questions.stream().map(TextSegment::from).toList();
List<Embedding> embeddings = embeddingModel.embedAll(embedTextSegments).content();
List<TextSegment> textSegments = questions.stream()
.map(question -> textSegment)
.collect(Collectors.toList());
return mdlEmbeddingStore.addAll(embeddings, textSegments);
List<TextSegment> textSegments = embeddings.stream().map(o -> textSegment).collect(Collectors.toList());
return mdlEmbeddingStore.addAll(embeddings, textSegments).stream();
})
.filter(Objects::nonNull)
.flatMap(Collection::stream)
.collect(Collectors.toList());
}

Expand All @@ -236,8 +250,7 @@ List<String> genHypotheticalQuestions(@V("instruction") String instruction,
}

private List<String> addMdlsForCE(List<SemanticModel> semanticModels) {
return semanticModels.stream()
.map(semanticModel -> {
return semanticModels.stream().flatMap(semanticModel -> {
SemanticModelUtil.validateSemanticModel(semanticModel);
SemanticModelView semanticModelView = SemanticModelUtil.toSemanticModelView(semanticModel);
List<String> columnTexts = Stream.of(
Expand Down Expand Up @@ -265,13 +278,9 @@ private List<String> addMdlsForCE(List<SemanticModel> semanticModels) {
TextSegment textSegment = TextSegment.from(json, MDL_METADATA);
List<TextSegment> embedTextSegments = columnTexts.stream().map(TextSegment::from).toList();
List<Embedding> embeddings = embeddingModel.embedAll(embedTextSegments).content();
List<TextSegment> textSegments = columnTexts.stream()
.map(question -> textSegment)
.collect(Collectors.toList());
return mdlEmbeddingStore.addAll(embeddings, textSegments);
List<TextSegment> textSegments = embeddings.stream().map(o -> textSegment).collect(Collectors.toList());
return mdlEmbeddingStore.addAll(embeddings, textSegments).stream();
})
.filter(Objects::nonNull)
.flatMap(Collection::stream)
.collect(Collectors.toList());
}

Expand Down Expand Up @@ -503,10 +512,42 @@ public void removeAllSyns() {
public List<String> addDocs(List<String> docs) {
    // Route to the indexing routine that matches the configured business-knowledge method.
    if (docIndexingMethod == BusinessKnowledgeIndexingMethod.PCCE) {
        return addDocsForPCCE(docs);
    }
    // Anything that is neither PCCE nor GCE is indexed with the full-embedding routine.
    return docIndexingMethod == BusinessKnowledgeIndexingMethod.GCE
            ? addDocsForGCE(docs)
            : addDocsForFE(docs);
}

/**
 * Indexes documents with the parent-child chunking (PCCE) method.
 * <p>
 * Each document yields its parent texts: the document itself, or, when the parent mode is
 * {@code PARAGRAPH}, the segments produced by the parent splitter. Every parent text is split
 * again into child segments; the child segments are embedded, and each embedding is stored
 * together with the text segment of its parent.
 *
 * @param docs the document texts to index
 * @return the ids returned by {@code docEmbeddingStore.addAll}, concatenated across all parents
 */
private List<String> addDocsForPCCE(List<String> docs) {
    final boolean splitIntoParagraphs = BusinessKnowledgeIndexingParentMode.PARAGRAPH == docPCCEParentMode;

    // A parent splitter is only built in PARAGRAPH mode; otherwise each document is its own parent.
    final DocumentSplitter parentSplitter;
    if (!splitIntoParagraphs) {
        parentSplitter = null;
    } else if (docPCCEParentChunkRegex == null) {
        parentSplitter = DocumentSplitters.recursive(docPCCEParentMaxChunkSize, 0);
    } else {
        // With a regex configured, the recursive splitter is passed on to the regex splitter.
        parentSplitter = new DocumentByRegexSplitter(docPCCEParentChunkRegex, "\n\n",
                docPCCEParentMaxChunkSize, 0,
                DocumentSplitters.recursive(docPCCEParentMaxChunkSize, 0));
    }

    // The child splitter is always needed; the regex variant wraps the recursive one.
    final DocumentSplitter recursiveChildSplitter = DocumentSplitters.recursive(docPCCEChildMaxChunkSize, 0);
    final DocumentSplitter childSplitter = docPCCEChildChunkRegex == null
            ? recursiveChildSplitter
            : new DocumentByRegexSplitter(docPCCEChildChunkRegex, "\n",
                    docPCCEChildMaxChunkSize, 0, recursiveChildSplitter);

    return docs.stream()
            .flatMap(doc -> {
                if (!splitIntoParagraphs) {
                    return Stream.of(doc);
                }
                return parentSplitter.split(Document.document(doc)).stream().map(TextSegment::text);
            })
            .flatMap(parentText -> {
                TextSegment parentSegment = TextSegment.from(parentText, DOC_METADATA);
                List<TextSegment> childSegments = childSplitter.split(Document.document(parentText));
                List<Embedding> childEmbeddings = embeddingModel.embedAll(childSegments).content();
                // Pair every child embedding with the same parent segment.
                List<TextSegment> parentPerChild = childEmbeddings.stream()
                        .map(ignored -> parentSegment)
                        .collect(Collectors.toList());
                return docEmbeddingStore.addAll(childEmbeddings, parentPerChild).stream();
            })
            .collect(Collectors.toList());
}

private List<String> addDocsForGCE(List<String> docs) {
DocumentSplitter splitter = DocumentSplitters.recursive(docGCEMaxChunkSize, docGCEMaxChunkOverlap);
if (docGCEChunkRegex != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
@Getter
public enum BusinessKnowledgeIndexingMethod {
FE("Full Embeddings"),
GCE("General Chunking Embeddings");
GCE("General Chunking Embeddings"),
PCCE("Parent-child Chunking Embeddings");

private final String description;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package ai.dat.core.contentstore.data;

import lombok.Getter;

/**
 * Parent chunk mode of the parent-child chunking (PCCE) business knowledge indexing method.
 * Each constant carries a human-readable description of the mode.
 *
 * @Author JunjieM
 * @Date 2025/10/23
 */
@Getter
public enum BusinessKnowledgeIndexingParentMode {
    /** The whole text is the parent chunk. */
    FULLTEXT("The entire text is used as the parent chunk and retrieved directly."),
    /** The text is split into paragraphs, and each paragraph is a parent chunk. */
    PARAGRAPH("This mode splits the text into paragraphs based on regular expression " +
            "and the maximum chunk length, using the split text as the parent chunk for retrieval.");

    /** Human-readable explanation of the mode. */
    private final String description;

    BusinessKnowledgeIndexingParentMode(String description) {
        this.description = description;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import ai.dat.core.contentstore.ContentStore;
import ai.dat.core.contentstore.DefaultContentStore;
import ai.dat.core.contentstore.data.BusinessKnowledgeIndexingMethod;
import ai.dat.core.contentstore.data.BusinessKnowledgeIndexingParentMode;
import ai.dat.core.contentstore.data.SemanticModelIndexingMethod;
import ai.dat.core.factories.data.ChatModelInstance;
import ai.dat.core.utils.FactoryUtil;
Expand Down Expand Up @@ -124,7 +125,7 @@ public class DefaultContentStoreFactory implements ContentStoreFactory {
public static final ConfigOption<Integer> BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE =
ConfigOptions.key("business-knowledge.indexing.gce-max-chunk-size")
.intType()
.defaultValue(4096)
.defaultValue(512)
.withDescription("Business knowledge `GCE` indexing method maximum chunk length.");

public static final ConfigOption<Integer> BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP =
Expand All @@ -140,6 +141,41 @@ public class DefaultContentStoreFactory implements ContentStoreFactory {
.withDescription("Business knowledge `GCE` indexing method split chunk regular expression. " +
"When it is empty, use the default built-in recursive split method.");

// Parent chunk mode for the `PCCE` indexing method; defaults to FULLTEXT.
// The description appends one "NAME: description" line per BusinessKnowledgeIndexingParentMode value.
public static final ConfigOption<BusinessKnowledgeIndexingParentMode> BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MODE =
ConfigOptions.key("business-knowledge.indexing.pcce-parent-mode")
.enumType(BusinessKnowledgeIndexingParentMode.class)
.defaultValue(BusinessKnowledgeIndexingParentMode.FULLTEXT)
.withDescription("Business knowledge `PCCE` indexing method parent chunk mode.\n" +
Arrays.stream(BusinessKnowledgeIndexingParentMode.values())
.map(e -> e.name() + ": " + e.getDescription())
.collect(Collectors.joining("\n")));

// Maximum parent chunk length for `PCCE`; defaults to 1024.
// Validation requires it to be greater than 0 and greater than the child maximum chunk size.
public static final ConfigOption<Integer> BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE =
ConfigOptions.key("business-knowledge.indexing.pcce-parent-max-chunk-size")
.intType()
.defaultValue(1024)
.withDescription("Business knowledge `PCCE` indexing method parent maximum chunk length.");

// Optional regular expression for splitting parent chunks; no default value.
public static final ConfigOption<String> BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_CHUNK_REGEX =
ConfigOptions.key("business-knowledge.indexing.pcce-parent-chunk-regex")
.stringType()
.noDefaultValue()
.withDescription("Business knowledge `PCCE` indexing method split parent chunk regular expression. " +
"When it is empty, use the default built-in recursive split method.");

// Maximum child chunk length for `PCCE`; defaults to 512.
// Validation requires it to be greater than 0 and less than the parent maximum chunk size.
public static final ConfigOption<Integer> BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE =
ConfigOptions.key("business-knowledge.indexing.pcce-child-max-chunk-size")
.intType()
.defaultValue(512)
.withDescription("Business knowledge `PCCE` indexing method child maximum chunk length.");

// Optional regular expression for splitting child chunks; no default value.
public static final ConfigOption<String> BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_CHUNK_REGEX =
ConfigOptions.key("business-knowledge.indexing.pcce-child-chunk-regex")
.stringType()
.noDefaultValue()
.withDescription("Business knowledge `PCCE` indexing method split child chunk regular expression. " +
"When it is empty, use the default built-in recursive split method.");

public static final ConfigOption<Integer> BUSINESS_KNOWLEDGE_RETRIEVAL_MAX_RESULTS =
ConfigOptions.key("business-knowledge.retrieval.max-results")
.intType()
Expand Down Expand Up @@ -254,6 +290,11 @@ public Set<ConfigOption<?>> optionalOptions() {
BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE,
BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP,
BUSINESS_KNOWLEDGE_INDEXING_GCE_CHUNK_REGEX,
BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MODE,
BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE,
BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_CHUNK_REGEX,
BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE,
BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_CHUNK_REGEX,
BUSINESS_KNOWLEDGE_RETRIEVAL_MAX_RESULTS,
BUSINESS_KNOWLEDGE_RETRIEVAL_MIN_SCORE
));
Expand All @@ -271,6 +312,11 @@ public Set<ConfigOption<?>> fingerprintOptions() {
BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE,
BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP,
BUSINESS_KNOWLEDGE_INDEXING_GCE_CHUNK_REGEX,
BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MODE,
BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE,
BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_CHUNK_REGEX,
BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE,
BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_CHUNK_REGEX,

// ------------------ Deprecated -------------------
SEMANTIC_MODEL_RETRIEVAL_STRATEGY,
Expand Down Expand Up @@ -401,6 +447,12 @@ public ContentStore create(@NonNull ReadableConfig config,
config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE).ifPresent(builder::docGCEMaxChunkSize);
config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP).ifPresent(builder::docGCEMaxChunkOverlap);
config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_GCE_CHUNK_REGEX).ifPresent(builder::docGCEChunkRegex);
} else if (BusinessKnowledgeIndexingMethod.PCCE == businessKnowledgeIndexingMethod) {
config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MODE).ifPresent(builder::docPCCEParentMode);
config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE).ifPresent(builder::docPCCEParentMaxChunkSize);
config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_CHUNK_REGEX).ifPresent(builder::docPCCEParentChunkRegex);
config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE).ifPresent(builder::docPCCEChildMaxChunkSize);
config.getOptional(BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_CHUNK_REGEX).ifPresent(builder::docPCCEChildChunkRegex);
}
config.getOptional(BUSINESS_KNOWLEDGE_RETRIEVAL_MAX_RESULTS).ifPresent(builder::docMaxResults);
config.getOptional(BUSINESS_KNOWLEDGE_RETRIEVAL_MIN_SCORE).ifPresent(builder::docMinScore);
Expand Down Expand Up @@ -451,10 +503,19 @@ private void validateConfigOptions(ReadableConfig config,
Preconditions.checkArgument(businessKnowledgeGCEMaxChunkSize > 0,
"'" + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE.key() + "' value must be greater than 0");
Preconditions.checkArgument(businessKnowledgeGCEMaxChunkOverlap >= 0,
"'" + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP.key() + "' value must be greater than than or equal to 0");
"'" + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP.key() + "' value must be greater than or equal to 0");
Preconditions.checkArgument(businessKnowledgeGCEMaxChunkSize > businessKnowledgeGCEMaxChunkOverlap,
"'" + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE.key() + "' value must be less than '"
+ BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP.key() + "' value");
"'" + BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_OVERLAP.key() + "' value must be less than '"
+ BUSINESS_KNOWLEDGE_INDEXING_GCE_MAX_CHUNK_SIZE.key() + "' value");
Integer businessKnowledgePCCEParentMaxChunkSize = config.get(BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE);
Integer businessKnowledgePCCEChildMaxChunkSize = config.get(BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE);
Preconditions.checkArgument(businessKnowledgePCCEParentMaxChunkSize > 0,
"'" + BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE.key() + "' value must be greater than 0");
Preconditions.checkArgument(businessKnowledgePCCEChildMaxChunkSize > 0,
"'" + BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE.key() + "' value must be greater than 0");
Preconditions.checkArgument(businessKnowledgePCCEParentMaxChunkSize > businessKnowledgePCCEChildMaxChunkSize,
"'" + BUSINESS_KNOWLEDGE_INDEXING_PCCE_CHILD_MAX_CHUNK_SIZE.key() + "' value must be less than '"
+ BUSINESS_KNOWLEDGE_INDEXING_PCCE_PARENT_MAX_CHUNK_SIZE.key() + "' value");
config.getOptional(BUSINESS_KNOWLEDGE_RETRIEVAL_MAX_RESULTS)
.ifPresent(n -> Preconditions.checkArgument(n >= 1 && n <= 100,
"'" + BUSINESS_KNOWLEDGE_RETRIEVAL_MAX_RESULTS.key() + "' value must be between 1 and 100"));
Expand Down