From 69940f7d95546be9a081dbca3881a9e43203c61c Mon Sep 17 00:00:00 2001 From: wangbing Date: Thu, 3 Apr 2025 23:32:05 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E5=A4=87=E4=BB=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 17 ++++--- .../java/xyz/wbsite/ai/HanLP_Example.java | 44 +++++++++++++++++++ src/main/java/xyz/wbsite/ai/Helper.java | 13 ++++++ .../wbsite/ai/Qdrant_Embedding_Example.java | 4 +- .../xyz/wbsite/ai/Text_Compare_Example.java | 8 ++-- 5 files changed, 75 insertions(+), 11 deletions(-) create mode 100644 src/main/java/xyz/wbsite/ai/HanLP_Example.java diff --git a/pom.xml b/pom.xml index 1a1cd80..47c2ad6 100644 --- a/pom.xml +++ b/pom.xml @@ -77,11 +77,11 @@ langchain4j-embeddings ${langchain4j.version} - - dev.langchain4j - langchain4j-embeddings-all-minilm-l6-v2 - ${langchain4j.version} - + + + + + dev.langchain4j @@ -102,6 +102,13 @@ 1.0.0-beta2 + + + com.hankcs + hanlp + portable-1.8.4 + + ch.qos.logback diff --git a/src/main/java/xyz/wbsite/ai/HanLP_Example.java b/src/main/java/xyz/wbsite/ai/HanLP_Example.java new file mode 100644 index 0000000..2b3bce7 --- /dev/null +++ b/src/main/java/xyz/wbsite/ai/HanLP_Example.java @@ -0,0 +1,44 @@ +package xyz.wbsite.ai; + +import com.hankcs.hanlp.HanLP; +import com.hankcs.hanlp.corpus.tag.Nature; +import com.hankcs.hanlp.seg.common.Term; +import com.hankcs.hanlp.tokenizer.StandardTokenizer; + +import java.util.List; + +public class HanLP_Example { + public static void main(String[] args) { + +// HanLPTokenizer hanLPTokenizer = new HanLPTokenizer(); +// String[] segment = hanLPTokenizer.segment("我喜欢吃苹果"); +// +// List 我喜欢吃苹果1 = HanLP.extractKeyword("我喜欢吃苹果", 2); +// +// HanLPEngine hanLPEngine = new HanLPEngine(); +// Result parse = hanLPEngine.parse("我喜欢吃苹果"); +// +// +// for (Word word : parse) { +// System.out.println(word); +// } + + List strings = HanLP.extractKeyword("身份证去哪里办理", 1); + List string = HanLP.extractKeyword("需要带什么材料", 1); + + // 分词 + List> lists = StandardTokenizer.seg2sentence("那么还需要哪些材料"); + for (List list : lists) { + for (Term term : list) { + System.out.println(term); + // 检查词性是否为主语相关的词性(例如:主谓宾中的主语通常是名词或代词) +// if (term.nature.equals(Nature.n)) { // 名词 +// System.out.println("主语: " + term); +// } +// if (term.nature.equals(Nature.r)) { // 代词,例如“他”、“她”等 +// System.out.println("主语: " + term); +// } + } + } + } +} diff --git a/src/main/java/xyz/wbsite/ai/Helper.java b/src/main/java/xyz/wbsite/ai/Helper.java index fd7ed29..56a6773 100644 --- a/src/main/java/xyz/wbsite/ai/Helper.java +++ b/src/main/java/xyz/wbsite/ai/Helper.java @@ -3,6 +3,7 @@ package xyz.wbsite.ai; import cn.hutool.core.collection.CollUtil; import dev.langchain4j.data.document.Document; import dev.langchain4j.model.openai.OpenAiChatModel; +import dev.langchain4j.model.openai.OpenAiEmbeddingModel; import dev.langchain4j.model.openai.OpenAiStreamingChatModel; import java.util.List; @@ -25,6 +26,14 @@ public class Helper { .logResponses(true) .build(); + private static OpenAiEmbeddingModel openAiEmbeddingModel = OpenAiEmbeddingModel.builder() + .baseUrl("http://192.168.88.106:11434/v1") + .apiKey("1") + .modelName("bge-m3") + .logRequests(true) + .logResponses(true) + .build(); + private static OpenAiChatModel toolChatModel = OpenAiChatModel.builder() .baseUrl("http://192.168.88.106:11434/v1") .apiKey("1") @@ -53,6 +62,10 @@ public class Helper { return gemmaModel; } + public static OpenAiEmbeddingModel getOpenAiEmbeddingModel() { + return openAiEmbeddingModel; + } + public static Document getDocument() { return Document.from("人往往在做梦的时候会打呼噜"); } diff --git a/src/main/java/xyz/wbsite/ai/Qdrant_Embedding_Example.java b/src/main/java/xyz/wbsite/ai/Qdrant_Embedding_Example.java index 21a3383..326aa70 100644 --- a/src/main/java/xyz/wbsite/ai/Qdrant_Embedding_Example.java +++ b/src/main/java/xyz/wbsite/ai/Qdrant_Embedding_Example.java @@ -3,7 +3,7 @@ package xyz.wbsite.ai; import dev.langchain4j.data.embedding.Embedding; import dev.langchain4j.data.segment.TextSegment; import dev.langchain4j.model.embedding.EmbeddingModel; -import dev.langchain4j.model.embedding.onnx.allminilml6v2.AllMiniLmL6V2EmbeddingModel; +import dev.langchain4j.model.embedding.onnx.bgesmallenv15q.BgeSmallEnV15QuantizedEmbeddingModel; import dev.langchain4j.store.embedding.EmbeddingStore; import dev.langchain4j.store.embedding.qdrant.QdrantEmbeddingStore; @@ -22,7 +22,7 @@ public class Qdrant_Embedding_Example { .collectionName("langchain4j-" + randomUUID()) .build(); - EmbeddingModel embeddingModel = new AllMiniLmL6V2EmbeddingModel(); + EmbeddingModel embeddingModel = new BgeSmallEnV15QuantizedEmbeddingModel(); TextSegment segment1 = TextSegment.from("I've been to France twice."); Embedding embedding1 = embeddingModel.embed(segment1).content(); embeddingStore.add(embedding1, segment1); diff --git a/src/main/java/xyz/wbsite/ai/Text_Compare_Example.java b/src/main/java/xyz/wbsite/ai/Text_Compare_Example.java index ea2aed9..318d597 100644 --- a/src/main/java/xyz/wbsite/ai/Text_Compare_Example.java +++ b/src/main/java/xyz/wbsite/ai/Text_Compare_Example.java @@ -2,7 +2,6 @@ package xyz.wbsite.ai; import dev.langchain4j.data.embedding.Embedding; import dev.langchain4j.model.embedding.EmbeddingModel; -import dev.langchain4j.model.embedding.onnx.allminilml6v2.AllMiniLmL6V2EmbeddingModel; import dev.langchain4j.model.embedding.onnx.bgesmallenv15q.BgeSmallEnV15QuantizedEmbeddingModel; import dev.langchain4j.store.embedding.CosineSimilarity; @@ -13,11 +12,12 @@ public class Text_Compare_Example { public static void main(String[] args) { // 初始化嵌入模型 - EmbeddingModel embeddingModel = new AllMiniLmL6V2EmbeddingModel(); +// EmbeddingModel embeddingModel = new BgeSmallEnV15QuantizedEmbeddingModel(); + EmbeddingModel embeddingModel = Helper.getOpenAiEmbeddingModel(); // 将文本转换为向量 - Embedding embedding1 = embeddingModel.embed("工伤医疗费的申领").content(); - Embedding embedding2 = embeddingModel.embed("预告登记的转移").content(); + Embedding embedding1 = embeddingModel.embed("身份证办理").content(); + Embedding embedding2 = embeddingModel.embed("身份证首次办理").content(); double between = CosineSimilarity.between(embedding1, embedding2); System.out.println("余弦相似度: " + between); // 值越接近1越相似