diff --git a/README.md b/README.md index 6d70264..161382f 100644 --- a/README.md +++ b/README.md @@ -486,9 +486,9 @@ Closes #123" - 5、支持将智能问数项目对外提供OpenAPI的服务; - 6、支持将智能问数项目对外提供MCP的服务; - 7、支持seed命令可以将CSV文件初始化加载入数据库; -- 8、基于LLM的数据探查辅助生成语义模型;(TODO) -- 9、数据模型、语义模型、智能问数的单元测试;(TODO) -- 10、SQL问答对、同义词、业务知识等向量化入库与检索;(TODO) +- 8、SQL问答对、同义词、业务知识等向量化入库与检索; +- 9、基于LLM的数据探查辅助生成语义模型;(TODO) +- 10、数据模型、语义模型、智能问数的单元测试;(TODO) - 11、指标的配置(构建语义模型后可以更进一步添加指标);(TODO) diff --git a/dat-cli/src/main/resources/project_init_template/models/examples/knowledge_examples1.yaml b/dat-cli/src/main/resources/project_init_template/models/examples/knowledge_examples1.yaml new file mode 100644 index 0000000..b57e5d9 --- /dev/null +++ b/dat-cli/src/main/resources/project_init_template/models/examples/knowledge_examples1.yaml @@ -0,0 +1,8 @@ +examples: + knowledge: + - | + COVID-19(新型冠状病毒肺炎)是由SARS-CoV-2病毒引起的呼吸道传染病。 + 主要传播途径为飞沫和密切接触,在相对封闭的环境中可经气溶胶传播。 + 典型症状包括发热、干咳、乏力等,部分患者可能丧失嗅觉或味觉。值得注意的是,许多感染者症状轻微甚至无症状,但仍具有传染性。 + 预防措施至关重要:接种疫苗、在人员密集场所佩戴口罩、勤洗手、多通风。若出现症状,应及时进行检测并自我隔离,保护他人。 + 虽然大多数患者可康复,但对老年人和有基础疾病者威胁较大,仍需保持警惕。 \ No newline at end of file diff --git a/dat-cli/src/main/resources/project_init_template/models/examples/knowledge_examples2.yaml b/dat-cli/src/main/resources/project_init_template/models/examples/knowledge_examples2.yaml new file mode 100644 index 0000000..0ce44f4 --- /dev/null +++ b/dat-cli/src/main/resources/project_init_template/models/examples/knowledge_examples2.yaml @@ -0,0 +1,8 @@ +examples: + knowledge: + - | + COVID 预防是关键: + 接种疫苗:有效降低重症和死亡风险。 + 良好卫生:勤洗手、戴口罩、常通风。 + 保持距离:避免人群聚集。 + 虽然目前毒株毒性减弱,但基础防护对保护高危人群(如老年人、有基础疾病者)仍至关重要。若出现症状请及时就医。 \ No newline at end of file diff --git a/dat-cli/src/main/resources/project_init_template/models/examples/sql_examples.yaml b/dat-cli/src/main/resources/project_init_template/models/examples/sql_examples.yaml new file mode 100644 index 0000000..77473bf --- /dev/null +++ b/dat-cli/src/main/resources/project_init_template/models/examples/sql_examples.yaml @@ -0,0 
+1,12 @@ +examples: + sql_pairs: + - question: 各个国家的平均病例数 + sql: | + SELECT country_covid_cases.country, AVG(country_covid_cases.cases_total) AS average_cases + FROM country_covid_cases + GROUP BY country_covid_cases.country + - question: 各个国家的病例总数 + sql: | + SELECT country_covid_cases.country, SUM(country_covid_cases.cases_total) AS total_cases + FROM country_covid_cases + GROUP BY country_covid_cases.country \ No newline at end of file diff --git a/dat-cli/src/main/resources/project_init_template/models/examples/synonyms_examples.yaml b/dat-cli/src/main/resources/project_init_template/models/examples/synonyms_examples.yaml new file mode 100644 index 0000000..f534b76 --- /dev/null +++ b/dat-cli/src/main/resources/project_init_template/models/examples/synonyms_examples.yaml @@ -0,0 +1,15 @@ +examples: + synonyms_pairs: + - word: COVID-19 + synonyms: + - 新型冠状病毒肺炎 + - 2019冠状病毒病 + - 新冠 + - 新冠肺炎 + - 冠状病毒/新冠 + - word: UnitedStates + synonyms: + - USA + - 美国 + - 美利坚合众国 + - The United States of America \ No newline at end of file diff --git a/dat-core/src/main/java/ai/dat/core/contentstore/ContentStore.java b/dat-core/src/main/java/ai/dat/core/contentstore/ContentStore.java index fd4488e..4e5caa0 100644 --- a/dat-core/src/main/java/ai/dat/core/contentstore/ContentStore.java +++ b/dat-core/src/main/java/ai/dat/core/contentstore/ContentStore.java @@ -62,7 +62,7 @@ default void removeSql(String id) { void removeAllSqls(); - // ---------------名词和同义词对------------------- + // ---------------词和同义词对------------------- default String addSyn(WordSynonymPair synonymPair) { return addSyns(List.of(synonymPair)).get(0); diff --git a/dat-core/src/main/java/ai/dat/core/contentstore/data/WordSynonymPair.java b/dat-core/src/main/java/ai/dat/core/contentstore/data/WordSynonymPair.java index a46af2f..7868fc9 100644 --- a/dat-core/src/main/java/ai/dat/core/contentstore/data/WordSynonymPair.java +++ b/dat-core/src/main/java/ai/dat/core/contentstore/data/WordSynonymPair.java @@ -20,47 +20,28 
@@ @JsonInclude(JsonInclude.Include.NON_NULL) public class WordSynonymPair { @NonNull - private final String noun; + private final String word; @NonNull private final List synonyms; - private String description; - @JsonCreator - private WordSynonymPair(@JsonProperty("noun") @NonNull String noun, + private WordSynonymPair(@JsonProperty("word") @NonNull String word, @JsonProperty("synonyms") @NonNull List synonyms) { - this.noun = noun; - this.synonyms = synonyms; - } - - @JsonCreator - private WordSynonymPair(@JsonProperty("noun") @NonNull String noun, - @JsonProperty("synonyms") @NonNull List synonyms, - @JsonProperty("description") String description) { - this.noun = noun; + this.word = word; this.synonyms = synonyms; - this.description = description; - } - - public static WordSynonymPair from(@NonNull String noun, @NonNull String synonym, String description) { - return new WordSynonymPair(noun, Collections.singletonList(synonym), description); - } - - public static WordSynonymPair from(@NonNull String noun, @NonNull List synonyms, String description) { - return new WordSynonymPair(noun, synonyms, description); } - public static WordSynonymPair from(@NonNull String noun, @NonNull String synonym) { - return new WordSynonymPair(noun, Collections.singletonList(synonym)); + public static WordSynonymPair from(@NonNull String word, @NonNull String synonym) { + return new WordSynonymPair(word, Collections.singletonList(synonym)); } - public static WordSynonymPair from(@NonNull String noun, @NonNull List synonyms) { - return new WordSynonymPair(noun, synonyms); + public static WordSynonymPair from(@NonNull String word, @NonNull List synonyms) { + return new WordSynonymPair(word, synonyms); } - public static WordSynonymPair from(@NonNull String noun, @NonNull String... synonyms) { - return new WordSynonymPair(noun, Arrays.asList(synonyms)); + public static WordSynonymPair from(@NonNull String word, @NonNull String... 
synonyms) { + return new WordSynonymPair(word, Arrays.asList(synonyms)); } public void add(@NonNull String synonym) { diff --git a/dat-core/src/main/resources/prompts/default/intent_classification_user_prompt_template.txt b/dat-core/src/main/resources/prompts/default/intent_classification_user_prompt_template.txt index bd94a9f..8f83c5f 100644 --- a/dat-core/src/main/resources/prompts/default/intent_classification_user_prompt_template.txt +++ b/dat-core/src/main/resources/prompts/default/intent_classification_user_prompt_template.txt @@ -12,9 +12,9 @@ SQL: {{ item.sql }} {% endif %} {% if synonyms %} -### NOUN AND SYNONYMS ### +### WORD AND SYNONYMS ### {% for item in synonyms %} -Noun: {{ item.noun }} Synonyms: {{ item.synonyms|join(', ') }} +Word: {{ item.word }} Synonyms: {{ item.synonyms|join(', ') }} {% endfor %} {% endif %} diff --git a/dat-core/src/main/resources/prompts/default/sql_generation_reasoning_user_prompt_template.txt b/dat-core/src/main/resources/prompts/default/sql_generation_reasoning_user_prompt_template.txt index 5e3d499..99b1d56 100644 --- a/dat-core/src/main/resources/prompts/default/sql_generation_reasoning_user_prompt_template.txt +++ b/dat-core/src/main/resources/prompts/default/sql_generation_reasoning_user_prompt_template.txt @@ -12,9 +12,9 @@ SQL: {{ item.sql }} {% endif %} {% if synonyms %} -### NOUN AND SYNONYMS ### +### WORD AND SYNONYMS ### {% for item in synonyms %} -Noun: {{ item.noun }} Synonyms: {{ item.synonyms|join(', ') }} +Word: {{ item.word }} Synonyms: {{ item.synonyms|join(', ') }} {% endfor %} {% endif %} diff --git a/dat-core/src/main/resources/prompts/default/sql_generation_reasoning_with_followup_user_prompt_template.txt b/dat-core/src/main/resources/prompts/default/sql_generation_reasoning_with_followup_user_prompt_template.txt index 8bc6d49..4ce7f26 100644 --- a/dat-core/src/main/resources/prompts/default/sql_generation_reasoning_with_followup_user_prompt_template.txt +++ 
b/dat-core/src/main/resources/prompts/default/sql_generation_reasoning_with_followup_user_prompt_template.txt @@ -12,9 +12,9 @@ SQL: {{ item.sql }} {% endif %} {% if synonyms %} -### NOUN AND SYNONYMS ### +### WORD AND SYNONYMS ### {% for item in synonyms %} -Noun: {{ item.noun }} Synonyms: {{ item.synonyms|join(', ') }} +Word: {{ item.word }} Synonyms: {{ item.synonyms|join(', ') }} {% endfor %} {% endif %} diff --git a/dat-core/src/main/resources/prompts/default/sql_generation_user_prompt_template.txt b/dat-core/src/main/resources/prompts/default/sql_generation_user_prompt_template.txt index 5ac81a3..2645fad 100644 --- a/dat-core/src/main/resources/prompts/default/sql_generation_user_prompt_template.txt +++ b/dat-core/src/main/resources/prompts/default/sql_generation_user_prompt_template.txt @@ -12,9 +12,9 @@ SQL: {{ item.sql }} {% endif %} {% if synonyms %} -### NOUN AND SYNONYMS ### +### WORD AND SYNONYMS ### {% for item in synonyms %} -Noun: {{ item.noun }} Synonyms: {{ item.synonyms|join(', ') }} +Word: {{ item.word }} Synonyms: {{ item.synonyms|join(', ') }} {% endfor %} {% endif %} diff --git a/dat-core/src/main/resources/prompts/default/sql_generation_with_followup_user_prompt_template.txt b/dat-core/src/main/resources/prompts/default/sql_generation_with_followup_user_prompt_template.txt index 2169cbf..f60d462 100644 --- a/dat-core/src/main/resources/prompts/default/sql_generation_with_followup_user_prompt_template.txt +++ b/dat-core/src/main/resources/prompts/default/sql_generation_with_followup_user_prompt_template.txt @@ -18,9 +18,9 @@ SQL: {% endif %} {% if synonyms %} -### NOUN AND SYNONYMS ### +### WORD AND SYNONYMS ### {% for item in synonyms %} -Noun: {{ item.noun }} Synonyms: {{ item.synonyms|join(', ') }} +Word: {{ item.word }} Synonyms: {{ item.synonyms|join(', ') }} {% endfor %} {% endif %} diff --git a/dat-sdk/src/main/java/ai/dat/boot/BuildStateManager.java b/dat-sdk/src/main/java/ai/dat/boot/BuildStateManager.java index a748597..b9e7f6d 
100644 --- a/dat-sdk/src/main/java/ai/dat/boot/BuildStateManager.java +++ b/dat-sdk/src/main/java/ai/dat/boot/BuildStateManager.java @@ -23,7 +23,7 @@ * @Date 2025/7/17 */ @Slf4j -public class BuildStateManager { +class BuildStateManager { private static final String STATE_FILE_PREFIX = "build_state_"; private static final String STATE_FILE_SUFFIX = ".json"; diff --git a/dat-sdk/src/main/java/ai/dat/boot/ChangeKnowledgeCacheUtil.java b/dat-sdk/src/main/java/ai/dat/boot/ChangeKnowledgeCacheUtil.java new file mode 100644 index 0000000..582d2cf --- /dev/null +++ b/dat-sdk/src/main/java/ai/dat/boot/ChangeKnowledgeCacheUtil.java @@ -0,0 +1,54 @@ +package ai.dat.boot; + +import lombok.NonNull; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * 变化的业务知识缓存工具 + * + * @Author JunjieM + * @Date 2025/9/26 + */ +class ChangeKnowledgeCacheUtil { + + private ChangeKnowledgeCacheUtil() { + } + + // Map>> + private final static Map>> CACHE = new HashMap<>(); + + public static void add(@NonNull String projectId, + @NonNull String relativePath, + @NonNull List knowledge) { + Map> listMap = new HashMap<>(); + if (CACHE.containsKey(projectId)) { + listMap = CACHE.get(projectId); + } + listMap.put(relativePath, knowledge); + CACHE.put(projectId, listMap); + } + + public static List get(@NonNull String projectId, + @NonNull String relativePath) { + Map> listMap = new HashMap<>(); + if (CACHE.containsKey(projectId)) { + listMap = CACHE.get(projectId); + } + return listMap.get(relativePath); + } + + public static Map> get(String projectId) { + if (CACHE.containsKey(projectId)) { + return CACHE.get(projectId); + } + return Collections.emptyMap(); + } + + public static Map> remove(String projectId) { + return CACHE.remove(projectId); + } +} diff --git a/dat-sdk/src/main/java/ai/dat/boot/ChangeQuestionSqlPairsCacheUtil.java b/dat-sdk/src/main/java/ai/dat/boot/ChangeQuestionSqlPairsCacheUtil.java new file mode 100644 index 
0000000..8a788f4 --- /dev/null +++ b/dat-sdk/src/main/java/ai/dat/boot/ChangeQuestionSqlPairsCacheUtil.java @@ -0,0 +1,55 @@ +package ai.dat.boot; + +import ai.dat.core.contentstore.data.QuestionSqlPair; +import lombok.NonNull; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * 变化的问答SQL对缓存工具 + * + * @Author JunjieM + * @Date 2025/9/26 + */ +class ChangeQuestionSqlPairsCacheUtil { + + private ChangeQuestionSqlPairsCacheUtil() { + } + + // Map>> + private final static Map>> CACHE = new HashMap<>(); + + public static void add(@NonNull String projectId, + @NonNull String relativePath, + @NonNull List questionSqlPairs) { + Map> listMap = new HashMap<>(); + if (CACHE.containsKey(projectId)) { + listMap = CACHE.get(projectId); + } + listMap.put(relativePath, questionSqlPairs); + CACHE.put(projectId, listMap); + } + + public static List get(@NonNull String projectId, + @NonNull String relativePath) { + Map> listMap = new HashMap<>(); + if (CACHE.containsKey(projectId)) { + listMap = CACHE.get(projectId); + } + return listMap.get(relativePath); + } + + public static Map> get(String projectId) { + if (CACHE.containsKey(projectId)) { + return CACHE.get(projectId); + } + return Collections.emptyMap(); + } + + public static Map> remove(String projectId) { + return CACHE.remove(projectId); + } +} diff --git a/dat-sdk/src/main/java/ai/dat/boot/ChangeSemanticModelsCacheUtil.java b/dat-sdk/src/main/java/ai/dat/boot/ChangeSemanticModelsCacheUtil.java index 1097eee..3c227f3 100644 --- a/dat-sdk/src/main/java/ai/dat/boot/ChangeSemanticModelsCacheUtil.java +++ b/dat-sdk/src/main/java/ai/dat/boot/ChangeSemanticModelsCacheUtil.java @@ -19,6 +19,7 @@ class ChangeSemanticModelsCacheUtil { private ChangeSemanticModelsCacheUtil() { } + // Map>> private final static Map>> CACHE = new HashMap<>(); public static void add(@NonNull String projectId, diff --git 
a/dat-sdk/src/main/java/ai/dat/boot/ChangeWordSynonymPairsCacheUtil.java b/dat-sdk/src/main/java/ai/dat/boot/ChangeWordSynonymPairsCacheUtil.java new file mode 100644 index 0000000..3af2ca1 --- /dev/null +++ b/dat-sdk/src/main/java/ai/dat/boot/ChangeWordSynonymPairsCacheUtil.java @@ -0,0 +1,55 @@ +package ai.dat.boot; + +import ai.dat.core.contentstore.data.WordSynonymPair; +import lombok.NonNull; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * 变化的同义词对缓存工具 + * + * @Author JunjieM + * @Date 2025/9/26 + */ +class ChangeWordSynonymPairsCacheUtil { + + private ChangeWordSynonymPairsCacheUtil() { + } + + // Map>> + private final static Map>> CACHE = new HashMap<>(); + + public static void add(@NonNull String projectId, + @NonNull String relativePath, + @NonNull List wordSynonymPairs) { + Map> listMap = new HashMap<>(); + if (CACHE.containsKey(projectId)) { + listMap = CACHE.get(projectId); + } + listMap.put(relativePath, wordSynonymPairs); + CACHE.put(projectId, listMap); + } + + public static List get(@NonNull String projectId, + @NonNull String relativePath) { + Map> listMap = new HashMap<>(); + if (CACHE.containsKey(projectId)) { + listMap = CACHE.get(projectId); + } + return listMap.get(relativePath); + } + + public static Map> get(String projectId) { + if (CACHE.containsKey(projectId)) { + return CACHE.get(projectId); + } + return Collections.emptyMap(); + } + + public static Map> remove(String projectId) { + return CACHE.remove(projectId); + } +} diff --git a/dat-sdk/src/main/java/ai/dat/boot/ContentStoreManager.java b/dat-sdk/src/main/java/ai/dat/boot/ContentStoreManager.java index 1b7a6d0..9be146d 100644 --- a/dat-sdk/src/main/java/ai/dat/boot/ContentStoreManager.java +++ b/dat-sdk/src/main/java/ai/dat/boot/ContentStoreManager.java @@ -5,7 +5,6 @@ import ai.dat.boot.utils.ProjectUtil; import ai.dat.core.contentstore.ContentStore; import ai.dat.core.data.project.DatProject; -import 
ai.dat.core.semantic.data.SemanticModel; import lombok.NonNull; import lombok.extern.slf4j.Slf4j; @@ -14,6 +13,8 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Optional; +import java.util.function.Predicate; import java.util.stream.Collectors; /** @@ -23,7 +24,7 @@ * @Date 2025/7/17 */ @Slf4j -public class ContentStoreManager { +class ContentStoreManager { private final DatProject project; @@ -40,75 +41,69 @@ public ContentStoreManager(DatProject project, Path projectPath, String stateId) public void updateStore(@NonNull List fileStates, @NonNull FileChanges changes) throws IOException { - Map fileStateMap = fileStates.stream() + Map oldFileStates = fileStates.stream() .collect(Collectors.toMap(SchemaFileState::getRelativePath, f -> f)); // 未变化的文件 List newFileStates = new ArrayList<>(changes.unchangedFiles()); // 处理删除的文件 - for (SchemaFileState fs : changes.deletedFiles()) { - String relativePath = fs.getRelativePath(); - if (!fileStateMap.containsKey(relativePath)) { - continue; - } - SchemaFileState fileState = fileStateMap.get(relativePath); - List oldVectorIds = fileState.getVectorIds(); - if (oldVectorIds != null && !oldVectorIds.isEmpty()) { - try { - contentStore.removeMdls(oldVectorIds); - } catch (Exception e) { - log.warn("Failed to delete the state: " + relativePath); - newFileStates.add(fileState); - } - } - } + changes.deletedFiles().forEach(fs -> remove(oldFileStates, fs)); // 处理新增的文件 - for (SchemaFileState fs : changes.newFiles()) { - List semanticModels = ChangeSemanticModelsCacheUtil - .get(project.getName(), fs.getRelativePath()); - if (semanticModels != null && !semanticModels.isEmpty()) { - List vectorIds = contentStore.addMdls(semanticModels); - newFileStates.add(SchemaFileState.builder() - .relativePath(fs.getRelativePath()) - .lastModified(fs.getLastModified()) - .md5Hash(fs.getMd5Hash()) - .semanticModelNames(fs.getSemanticModelNames()) - .vectorIds(vectorIds) - 
.dependencies(fs.getDependencies()) - .build()); - } - } + changes.newFiles().forEach(fs -> add(newFileStates, fs)); // 处理修改的文件 - for (SchemaFileState fs : changes.modifiedFiles()) { - String relativePath = fs.getRelativePath(); - if (!fileStateMap.containsKey(relativePath)) { - continue; - } - SchemaFileState fileState = fileStateMap.get(relativePath); - List oldVectorIds = fileState.getVectorIds(); - if (oldVectorIds != null && !oldVectorIds.isEmpty()) { - try { - contentStore.removeMdls(oldVectorIds); - } catch (Exception e) { - log.warn("Failed to delete the state: " + relativePath); - newFileStates.add(fileState); - continue; - } - } - List semanticModels = ChangeSemanticModelsCacheUtil - .get(project.getName(), fs.getRelativePath()); - if (semanticModels != null && !semanticModels.isEmpty()) { - List vectorIds = contentStore.addMdls(semanticModels); - newFileStates.add(SchemaFileState.builder() - .relativePath(fs.getRelativePath()) - .lastModified(fs.getLastModified()) - .md5Hash(fs.getMd5Hash()) - .semanticModelNames(fs.getSemanticModelNames()) - .vectorIds(vectorIds) - .dependencies(fs.getDependencies()) - .build()); - } - } + changes.modifiedFiles().forEach(fs -> { + remove(oldFileStates, fs); + add(newFileStates, fs); + }); + // 保存状态 stateManager.saveBuildState(stateId, newFileStates); } + private void add(List newFileStates, SchemaFileState fileState) { + String projectId = project.getName(); + String relativePath = fileState.getRelativePath(); + SchemaFileState.SchemaFileStateBuilder builder = SchemaFileState.builder() + .relativePath(fileState.getRelativePath()) + .lastModified(fileState.getLastModified()) + .md5Hash(fileState.getMd5Hash()) + .semanticModelNames(fileState.getSemanticModelNames()) + .modelFileStates(fileState.getModelFileStates()); + Optional.ofNullable(ChangeSemanticModelsCacheUtil.get(projectId, relativePath)) + .filter(Predicate.not(List::isEmpty)) + .map(contentStore::addMdls) + .ifPresent(builder::semanticModelVectorIds); + 
Optional.ofNullable(ChangeQuestionSqlPairsCacheUtil.get(projectId, relativePath)) + .filter(Predicate.not(List::isEmpty)) + .map(contentStore::addSqls) + .ifPresent(builder::questionSqlPairVectorIds); + Optional.ofNullable(ChangeWordSynonymPairsCacheUtil.get(projectId, relativePath)) + .filter(Predicate.not(List::isEmpty)) + .map(contentStore::addSyns) + .ifPresent(builder::wordSynonymPairVectorIds); + Optional.ofNullable(ChangeKnowledgeCacheUtil.get(projectId, relativePath)) + .filter(Predicate.not(List::isEmpty)) + .map(contentStore::addDocs) + .ifPresent(builder::knowledgeVectorIds); + newFileStates.add(builder.build()); + } + + private void remove(Map oldFileStates, SchemaFileState fileState) { + String relativePath = fileState.getRelativePath(); + if (!oldFileStates.containsKey(relativePath)) { + return; + } + SchemaFileState oldFileState = oldFileStates.get(relativePath); + Optional.ofNullable(oldFileState.getSemanticModelVectorIds()) + .filter(Predicate.not(List::isEmpty)) + .ifPresent(contentStore::removeMdls); + Optional.ofNullable(oldFileState.getQuestionSqlPairVectorIds()) + .filter(Predicate.not(List::isEmpty)) + .ifPresent(contentStore::removeSqls); + Optional.ofNullable(oldFileState.getWordSynonymPairVectorIds()) + .filter(Predicate.not(List::isEmpty)) + .ifPresent(contentStore::removeSyns); + Optional.ofNullable(oldFileState.getKnowledgeVectorIds()) + .filter(Predicate.not(List::isEmpty)) + .ifPresent(contentStore::removeDocs); + } + } \ No newline at end of file diff --git a/dat-sdk/src/main/java/ai/dat/boot/FileChangeAnalyzer.java b/dat-sdk/src/main/java/ai/dat/boot/FileChangeAnalyzer.java index bdb440e..43828b2 100644 --- a/dat-sdk/src/main/java/ai/dat/boot/FileChangeAnalyzer.java +++ b/dat-sdk/src/main/java/ai/dat/boot/FileChangeAnalyzer.java @@ -1,12 +1,13 @@ package ai.dat.boot; import ai.dat.boot.data.FileChanges; -import ai.dat.boot.data.ModelFileState; +import ai.dat.boot.data.RelevantFileState; import ai.dat.boot.data.SchemaFileState; 
import ai.dat.boot.utils.FileUtil; import ai.dat.boot.utils.ProjectUtil; import ai.dat.core.data.DatModel; import ai.dat.core.data.DatSchema; +import ai.dat.core.data.example.Example; import ai.dat.core.data.project.DatProject; import ai.dat.core.exception.ValidationException; import ai.dat.core.semantic.data.SemanticModel; @@ -31,7 +32,7 @@ * @Date 2025/7/17 */ @Slf4j -public class FileChangeAnalyzer { +class FileChangeAnalyzer { private final Path modelsPath; @@ -67,25 +68,15 @@ public FileChanges analyzeChanges(List fileStates) { String relativePath = modelsPath.relativize(filePath).toString(); SchemaFileState fileState = fileStateMap.get(relativePath); if (fileState == null) { - // 新文件 + // 新YAML文件 long lastModified = FileUtil.lastModified(filePath); String md5Hash = FileUtil.md5(filePath); DatSchema schema = ProjectUtil.loadSchema(filePath, modelsPath); - List dependencies = resolveDependencies(relativePath, schema); - ChangeSemanticModelsCacheUtil.add(project.getName(), relativePath, - DatSchemaUtil.getSemanticModels(schema, getDatModels(dependencies))); - List semanticModelNames = schema.getSemanticModels().stream() - .map(SemanticModel::getName) - .collect(Collectors.toList()); - newFiles.add(SchemaFileState.builder() - .relativePath(relativePath) - .lastModified(lastModified) - .md5Hash(md5Hash) - .semanticModelNames(semanticModelNames) - .dependencies(dependencies) - .build()); + List modelFileStates = resolveModelFileStates(relativePath, schema); + newFiles.add(createSchemaFileState( + relativePath, lastModified, md5Hash, schema, modelFileStates)); } else { - // 已存在的文件,检查是否发生变化 + // 已存在的YAML文件,检查是否发生变化 boolean hasChanged = false; String md5Hash = null; long lastModified = FileUtil.lastModified(filePath); @@ -93,31 +84,21 @@ public FileChanges analyzeChanges(List fileStates) { md5Hash = FileUtil.md5(filePath); hasChanged = !md5Hash.equals(fileState.getMd5Hash()); } - List dependencies = Collections.emptyList(); + List modelFileStates = 
Collections.emptyList(); DatSchema schema = null; - if (hasChanged || !fileState.getDependencies().isEmpty()) { + if (hasChanged || !fileState.getModelFileStates().isEmpty()) { schema = ProjectUtil.loadSchema(filePath, modelsPath); } - if (!fileState.getDependencies().isEmpty()) { - dependencies = resolveDependencies(relativePath, schema); - hasChanged = hasDependencyChanged(dependencies, fileState.getDependencies()); + if (!fileState.getModelFileStates().isEmpty()) { + modelFileStates = resolveModelFileStates(relativePath, schema); + hasChanged = hasModelFileChanged(modelFileStates, fileState.getModelFileStates()); } if (hasChanged) { - // 文件已修改 - ChangeSemanticModelsCacheUtil.add(project.getName(), relativePath, - DatSchemaUtil.getSemanticModels(schema, getDatModels(dependencies))); - List semanticModelNames = schema.getSemanticModels().stream() - .map(SemanticModel::getName) - .collect(Collectors.toList()); - modifiedFiles.add(SchemaFileState.builder() - .relativePath(relativePath) - .lastModified(lastModified) - .md5Hash(md5Hash) - .semanticModelNames(semanticModelNames) - .dependencies(dependencies) - .build()); + // YAML文件已修改 + modifiedFiles.add(createSchemaFileState( + relativePath, lastModified, md5Hash, schema, modelFileStates)); } else { - // 文件未变化,保留之前的元数据 + // YAML文件未变化,保留之前的元数据 unchangedFiles.add(fileState); } } @@ -126,7 +107,7 @@ public FileChanges analyzeChanges(List fileStates) { // 检查语义模型名称是否有重复 validateSemanticModelNames(newFiles, modifiedFiles, unchangedFiles); - // 查找已删除的文件 - 直接内联处理逻辑 + // 查找已删除的YAML文件 - 直接内联处理逻辑 List relativePaths = yamlFilePaths.stream() .map(p -> modelsPath.relativize(p).toString()).toList(); List deletedFiles = fileStates.stream() @@ -136,6 +117,31 @@ public FileChanges analyzeChanges(List fileStates) { return new FileChanges(newFiles, modifiedFiles, unchangedFiles, deletedFiles); } + private SchemaFileState createSchemaFileState(String relativePath, long lastModified, String md5Hash, + DatSchema schema, List 
modelFileStates) { + ChangeSemanticModelsCacheUtil.add(project.getName(), relativePath, + DatSchemaUtil.getSemanticModels(schema, getDatModels(modelFileStates))); + Example example = schema.getExample(); + if (example != null) { + ChangeQuestionSqlPairsCacheUtil.add(project.getName(), relativePath, + example.getQuestionSqlPairs()); + ChangeWordSynonymPairsCacheUtil.add(project.getName(), relativePath, + example.getWordSynonymPairs()); + ChangeKnowledgeCacheUtil.add(project.getName(), relativePath, + example.getKnowledge()); + } + List semanticModelNames = schema.getSemanticModels().stream() + .map(SemanticModel::getName) + .collect(Collectors.toList()); + return SchemaFileState.builder() + .relativePath(relativePath) + .lastModified(lastModified) + .md5Hash(md5Hash) + .semanticModelNames(semanticModelNames) + .modelFileStates(modelFileStates) + .build(); + } + private void validateSemanticModelNames(List newFiles, List modifiedFiles, List unchangedFiles) { @@ -163,7 +169,7 @@ private void validateSemanticModelNames(List newFiles, } } - private List resolveDependencies(String relativePath, DatSchema schema) { + private List resolveModelFileStates(String relativePath, DatSchema schema) { return DatSchemaUtil.getModelName(schema).stream() .map(modelName -> { try { @@ -174,17 +180,17 @@ private List resolveDependencies(String relativePath, DatSchema }).collect(Collectors.toList()); } - private List getDatModels(List dependencies) { - if (dependencies == null || dependencies.isEmpty()) { + private List getDatModels(List modelFileStates) { + if (modelFileStates == null || modelFileStates.isEmpty()) { return Collections.emptyList(); } - return dependencies.stream() + return modelFileStates.stream() .filter(f -> sqlFileRelativePaths.contains(f.getRelativePath())) .map(f -> ProjectUtil.loadModel(modelsPath.resolve(f.getRelativePath()), modelsPath)) .collect(Collectors.toList()); } - private ModelFileState getModelFileMetadata(String relativePath, String modelName) throws 
IOException { + private RelevantFileState getModelFileMetadata(String relativePath, String modelName) throws IOException { List modelFiles = sqlFilePaths.stream() .filter(p -> FileUtil.fileNameWithoutSuffix(p.getFileName().toString()).equals(modelName)) .toList(); @@ -204,11 +210,11 @@ private ModelFileState getModelFileMetadata(String relativePath, String modelNam String modelRelativePath = modelsPath.relativize(modelFilePath).toString(); long lastModified = FileUtil.lastModified(modelFilePath); String md5Hash = FileUtil.md5(modelFilePath); - return new ModelFileState(modelRelativePath, lastModified, md5Hash); + return new RelevantFileState(modelRelativePath, lastModified, md5Hash); } - private boolean hasDependencyChanged(List currentDeps, - List previousDeps) { + private boolean hasModelFileChanged(List currentDeps, + List previousDeps) { if (previousDeps == null || previousDeps.isEmpty()) { return !currentDeps.isEmpty(); // 之前没有依赖,现在有依赖 } @@ -219,15 +225,15 @@ private boolean hasDependencyChanged(List currentDeps, if (currentDeps.size() != previousDeps.size()) { return true; } - Map currentMap = currentDeps.stream() - .collect(Collectors.toMap(ModelFileState::getRelativePath, d -> d)); - Map previousMap = previousDeps.stream() - .collect(Collectors.toMap(ModelFileState::getRelativePath, d -> d)); + Map currentMap = currentDeps.stream() + .collect(Collectors.toMap(RelevantFileState::getRelativePath, d -> d)); + Map previousMap = previousDeps.stream() + .collect(Collectors.toMap(RelevantFileState::getRelativePath, d -> d)); // 检查每个依赖文件是否变化 - for (Map.Entry entry : currentMap.entrySet()) { + for (Map.Entry entry : currentMap.entrySet()) { String relativePath = entry.getKey(); - ModelFileState current = entry.getValue(); - ModelFileState previous = previousMap.get(relativePath); + RelevantFileState current = entry.getValue(); + RelevantFileState previous = previousMap.get(relativePath); if (previous == null) { return true; } diff --git 
a/dat-sdk/src/main/java/ai/dat/boot/PreBuildValidator.java b/dat-sdk/src/main/java/ai/dat/boot/PreBuildValidator.java index 267de4f..83935a5 100644 --- a/dat-sdk/src/main/java/ai/dat/boot/PreBuildValidator.java +++ b/dat-sdk/src/main/java/ai/dat/boot/PreBuildValidator.java @@ -35,10 +35,10 @@ * @Author JunjieM * @Date 2025/8/7 */ -public class PreBuildValidator { +class PreBuildValidator { private final DatProject project; - private final Path projectPath; + private final Path projectPath; public PreBuildValidator(DatProject project, Path projectPath) { this.project = project; diff --git a/dat-sdk/src/main/java/ai/dat/boot/data/ModelFileState.java b/dat-sdk/src/main/java/ai/dat/boot/data/RelevantFileState.java similarity index 94% rename from dat-sdk/src/main/java/ai/dat/boot/data/ModelFileState.java rename to dat-sdk/src/main/java/ai/dat/boot/data/RelevantFileState.java index 81430a1..ac43f02 100644 --- a/dat-sdk/src/main/java/ai/dat/boot/data/ModelFileState.java +++ b/dat-sdk/src/main/java/ai/dat/boot/data/RelevantFileState.java @@ -13,7 +13,7 @@ @NoArgsConstructor @AllArgsConstructor @JsonInclude(JsonInclude.Include.NON_NULL) -public class ModelFileState { +public class RelevantFileState { /** * 文件相对路径 */ diff --git a/dat-sdk/src/main/java/ai/dat/boot/data/SchemaFileState.java b/dat-sdk/src/main/java/ai/dat/boot/data/SchemaFileState.java index 0b34d1a..76d9746 100644 --- a/dat-sdk/src/main/java/ai/dat/boot/data/SchemaFileState.java +++ b/dat-sdk/src/main/java/ai/dat/boot/data/SchemaFileState.java @@ -1,6 +1,7 @@ package ai.dat.boot.data; import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; @@ -9,8 +10,6 @@ import java.util.List; /** - * 文件元数据,记录文件的构建状态信息 - * * @Author JunjieM * @Date 2025/7/16 */ @@ -38,15 +37,39 @@ public class SchemaFileState { /** * 语义模型名称列表 */ - private List semanticModelNames; + private List semanticModelNames 
= List.of(); /** - * 向量存储ID列表 + * 语义模型的向量存储ID列表 + *

+ * 由于之前参数名为vectorIds,为了考虑兼容性这里设置@JsonProperty("vectorIds") */ - private List vectorIds; + @JsonProperty("vectorIds") + private List semanticModelVectorIds = List.of(); /** - * 依赖的模型文件信息 + * 模型的文件信息列表 + *

+ * 由于之前参数名为dependencies,为了考虑兼容性这里设置@JsonProperty("dependencies") */ - private List dependencies; + @JsonProperty("dependencies") + private List modelFileStates = List.of(); + + /** + * 问答SQL对的向量存储ID列表 + */ + @JsonProperty("sqlPairVectorIds") + private List questionSqlPairVectorIds = List.of(); + + /** + * 同义词对的向量存储ID列表 + */ + @JsonProperty("synonymPairVectorIds") + private List wordSynonymPairVectorIds = List.of(); + + /** + * 业务知识的向量存储ID列表 + */ + @JsonProperty("knowledgeVectorIds") + private List knowledgeVectorIds = List.of(); } \ No newline at end of file diff --git a/dat-sdk/src/main/java/ai/dat/core/data/DatSchema.java b/dat-sdk/src/main/java/ai/dat/core/data/DatSchema.java index 22ec848..90e3a40 100644 --- a/dat-sdk/src/main/java/ai/dat/core/data/DatSchema.java +++ b/dat-sdk/src/main/java/ai/dat/core/data/DatSchema.java @@ -1,5 +1,6 @@ package ai.dat.core.data; +import ai.dat.core.data.example.Example; import ai.dat.core.data.seed.SeedSpec; import ai.dat.core.semantic.data.SemanticModel; import com.fasterxml.jackson.annotation.JsonProperty; @@ -22,4 +23,7 @@ public class DatSchema { @NonNull @JsonProperty("seeds") private List seeds = List.of(); + + @JsonProperty("examples") + private Example example; } \ No newline at end of file diff --git a/dat-sdk/src/main/java/ai/dat/core/data/example/Example.java b/dat-sdk/src/main/java/ai/dat/core/data/example/Example.java new file mode 100644 index 0000000..48b4bf5 --- /dev/null +++ b/dat-sdk/src/main/java/ai/dat/core/data/example/Example.java @@ -0,0 +1,30 @@ +package ai.dat.core.data.example; + +import ai.dat.core.contentstore.data.QuestionSqlPair; +import ai.dat.core.contentstore.data.WordSynonymPair; +import com.fasterxml.jackson.annotation.JsonProperty; +import lombok.Getter; +import lombok.NonNull; +import lombok.Setter; + +import java.util.List; + +/** + * @Author JunjieM + * @Date 2025/9/26 + */ +@Setter +@Getter +public class Example { + @NonNull + @JsonProperty("sql_pairs") + private List questionSqlPairs = 
List.of(); + + @NonNull + @JsonProperty("synonyms_pairs") + private List wordSynonymPairs = List.of(); + + @NonNull + @JsonProperty("knowledge") + private List knowledge = List.of(); +} diff --git a/dat-sdk/src/main/resources/schemas/schema.json b/dat-sdk/src/main/resources/schemas/schema.json index 11234ea..4758ed3 100644 --- a/dat-sdk/src/main/resources/schemas/schema.json +++ b/dat-sdk/src/main/resources/schemas/schema.json @@ -440,6 +440,83 @@ } } } + }, + "examples": { + "type": "object", + "properties": { + "sql_pairs": { + "type": "array", + "description": "问答SQL对", + "minItems": 1, + "uniqueItems": true, + "items": { + "type": "object", + "required": [ + "question", + "sql" + ], + "properties": { + "question": { + "type": "string", + "description": "问题" + }, + "sql":{ + "type": "string", + "description": "语义SQL" + } + } + } + }, + "synonyms_pairs": { + "type": "array", + "description": "同义词对", + "minItems": 1, + "uniqueItems": true, + "items": { + "type": "object", + "required": [ + "word", + "synonyms" + ], + "properties": { + "word": { + "type": "string", + "description": "单词" + }, + "synonyms":{ + "type": "array", + "minItems": 1, + "uniqueItems": true, + "items": { + "type": "string" + }, + "description": "同义词" + } + } + } + }, + "knowledge": { + "type": "array", + "description": "业务知识", + "minItems": 1, + "uniqueItems": true, + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "anyOf": [ + { + "required": ["sql_pairs"] + }, + { + "required": ["synonyms_pairs"] + }, + { + "required": ["knowledge"] + } + ] } }, "additionalProperties": false, @@ -449,6 +526,9 @@ }, { "required": ["seeds"] + }, + { + "required": ["examples"] } ] } \ No newline at end of file