{"id":6205,"date":"2024-07-01T14:29:37","date_gmt":"2024-07-01T06:29:37","guid":{"rendered":"https:\/\/t.n-years.com\/?p=6205"},"modified":"2024-07-01T14:29:39","modified_gmt":"2024-07-01T06:29:39","slug":"nltk-%e5%ba%93%e7%9a%84%e4%bd%9c%e7%94%a8%e5%8f%8a%e4%b8%bb%e8%a6%81%e5%8a%9f%e8%83%bd","status":"publish","type":"post","link":"https:\/\/t.n-years.com\/?p=6205","title":{"rendered":"NLTK \u5e93\u7684\u4f5c\u7528\u53ca\u4e3b\u8981\u529f\u80fd"},"content":{"rendered":"\n<p class=\"wp-block-paragraph\">\u975e\u539f\u521b\uff0cChatGPT \u539f\u521b\u751f\u6210<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">NLTK\uff08Natural Language Toolkit\uff09\u662f\u4e00\u4e2a\u7528\u4e8e\u81ea\u7136\u8bed\u8a00\u5904\u7406\uff08NLP\uff09\u7684\u5f00\u6e90 Python \u5e93\uff0c\u5e7f\u6cdb\u5e94\u7528\u4e8e\u7814\u7a76\u548c\u5f00\u53d1\u8bed\u8a00\u5904\u7406\u4efb\u52a1\u3002\u5b83\u63d0\u4f9b\u4e86\u4e00\u7cfb\u5217\u4e30\u5bcc\u7684\u5de5\u5177\u548c\u8d44\u6e90\uff0c\u65b9\u4fbf\u7528\u6237\u5904\u7406\u548c\u5206\u6790\u6587\u672c\u6570\u636e\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">NLTK \u7684\u4e3b\u8981\u4f5c\u7528<\/h3>\n\n\n\n<ol class=\"wp-block-list\">\n<li><strong>\u6587\u672c\u9884\u5904\u7406<\/strong>\uff1a\u63d0\u4f9b\u5404\u79cd\u5de5\u5177\u7528\u4e8e\u6587\u672c\u7684\u6e05\u6d17\u3001\u6807\u8bb0\u3001\u5206\u8bcd\u3001\u8bcd\u5e72\u63d0\u53d6\u3001\u8bcd\u5f62\u8fd8\u539f\u7b49\u3002<\/li>\n\n\n\n<li><strong>\u8bcd\u6c47\u8d44\u6e90<\/strong>\uff1a\u5305\u542b\u5927\u91cf\u7684\u8bed\u6599\u5e93\u548c\u8bcd\u6c47\u8d44\u6e90\uff0c\u5982\u8bcd\u5178\u3001\u540c\u4e49\u8bcd\u5e93\u7b49\u3002<\/li>\n\n\n\n<li><strong>\u8bed\u8a00\u5efa\u6a21<\/strong>\uff1a\u652f\u6301 n-gram 
\u6a21\u578b\u3001\u6982\u7387\u8bed\u8a00\u6a21\u578b\u7b49\u3002<\/li>\n\n\n\n<li><strong>\u6587\u672c\u5206\u7c7b<\/strong>\uff1a\u63d0\u4f9b\u5206\u7c7b\u5668\u548c\u5206\u7c7b\u7b97\u6cd5\uff0c\u7528\u4e8e\u6587\u672c\u5206\u7c7b\u4efb\u52a1\u3002<\/li>\n\n\n\n<li><strong>\u8bed\u6cd5\u5206\u6790<\/strong>\uff1a\u652f\u6301\u8bcd\u6027\u6807\u6ce8\u3001\u53e5\u6cd5\u5206\u6790\u3001\u4f9d\u5b58\u5206\u6790\u7b49\u3002<\/li>\n\n\n\n<li><strong>\u4fe1\u606f\u63d0\u53d6<\/strong>\uff1a\u63d0\u4f9b\u547d\u540d\u5b9e\u4f53\u8bc6\u522b\u3001\u5173\u7cfb\u62bd\u53d6\u7b49\u5de5\u5177\u3002<\/li>\n<\/ol>\n\n\n\n<h3 class=\"wp-block-heading\">NLTK \u7684\u4e3b\u8981\u529f\u80fd\u548c\u7528\u6cd5<\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">1. \u8bed\u6599\u5e93\u548c\u8bcd\u6c47\u8d44\u6e90<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">NLTK \u63d0\u4f9b\u4e86\u4e30\u5bcc\u7684\u8bed\u6599\u5e93\u548c\u8bcd\u6c47\u8d44\u6e90\uff0c\u65b9\u4fbf\u7528\u6237\u8fdb\u884c\u8bed\u8a00\u7814\u7a76\u548c\u5f00\u53d1\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import nltk\nfrom nltk.corpus import brown\n\n# \u4e0b\u8f7d\u8bed\u6599\u5e93\nnltk.download('brown')\n\n# \u4f7f\u7528\u8bed\u6599\u5e93\nprint(brown.words())\nprint(brown.sents())<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">2. 
\u6587\u672c\u9884\u5904\u7406<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">NLTK \u63d0\u4f9b\u4e86\u591a\u79cd\u6587\u672c\u9884\u5904\u7406\u5de5\u5177\uff0c\u5982\u5206\u8bcd\u3001\u8bcd\u6027\u6807\u6ce8\u3001\u8bcd\u5e72\u63d0\u53d6\u548c\u8bcd\u5f62\u8fd8\u539f\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from nltk.tokenize import word_tokenize, sent_tokenize\nfrom nltk.stem import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\nfrom nltk import pos_tag\n\n# \u4e0b\u8f7d\u5fc5\u8981\u8d44\u6e90\nnltk.download('punkt')\nnltk.download('averaged_perceptron_tagger')\nnltk.download('wordnet')\n\n# \u5206\u8bcd\ntext = \"NLTK is a leading platform for building Python programs to work with human language data.\"\ntokens = word_tokenize(text)\nprint(tokens)\n\n# \u8bcd\u6027\u6807\u6ce8\ntagged_tokens = pos_tag(tokens)\nprint(tagged_tokens)\n\n# \u8bcd\u5e72\u63d0\u53d6\nstemmer = PorterStemmer()\nstems = &#91;stemmer.stem(token) for token in tokens]\nprint(stems)\n\n# \u8bcd\u5f62\u8fd8\u539f\nlemmatizer = WordNetLemmatizer()\nlemmas = &#91;lemmatizer.lemmatize(token) for token in tokens]\nprint(lemmas)<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">3. \u53e5\u6cd5\u5206\u6790<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">NLTK \u63d0\u4f9b\u4e86\u8bcd\u6027\u6807\u6ce8\u3001\u53e5\u6cd5\u5206\u6790\u7b49\u5de5\u5177\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from nltk import CFG\n\n# \u5b9a\u4e49\u4e0a\u4e0b\u6587\u65e0\u5173\u6587\u6cd5\ngrammar = CFG.fromstring(\"\"\"\n    S -&gt; NP VP\n    NP -&gt; DT NN\n    VP -&gt; VB NP\n    DT -&gt; 'the'\n    NN -&gt; 'dog' | 'cat'\n    VB -&gt; 'chased' | 'saw'\n\"\"\")\n\n# \u89e3\u6790\u5668\nparser = nltk.ChartParser(grammar)\nsentence = word_tokenize(\"the dog chased the cat\")\nfor tree in parser.parse(sentence):\n    print(tree)\n    tree.draw()<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">4. 
\u6587\u672c\u5206\u7c7b<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">NLTK \u63d0\u4f9b\u4e86\u5404\u79cd\u5206\u7c7b\u5668\u548c\u6587\u672c\u5206\u7c7b\u7684\u5de5\u5177\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import random\nfrom nltk.classify import NaiveBayesClassifier\nfrom nltk.corpus import movie_reviews\n\n# \u4e0b\u8f7d\u5fc5\u8981\u8d44\u6e90\nnltk.download('movie_reviews')\n\n# \u7279\u5f81\u63d0\u53d6\ndef extract_features(words):\n    return {word: True for word in words}\n\n# \u83b7\u53d6\u6570\u636e\u96c6\ndocuments = &#91;(list(movie_reviews.words(fileid)), category)\n             for category in movie_reviews.categories()\n             for fileid in movie_reviews.fileids(category)]\n\n# \u6253\u4e71\u6570\u636e\u96c6\uff0c\u907f\u514d\u6d4b\u8bd5\u96c6\u53ea\u5305\u542b\u5355\u4e00\u7c7b\u522b\nrandom.shuffle(documents)\n\n# \u5212\u5206\u8bad\u7ec3\u548c\u6d4b\u8bd5\u96c6\ntrain_set = &#91;(extract_features(d), c) for (d, c) in documents&#91;:1600]]\ntest_set = &#91;(extract_features(d), c) for (d, c) in documents&#91;1600:]]\n\n# \u8bad\u7ec3\u5206\u7c7b\u5668\nclassifier = NaiveBayesClassifier.train(train_set)\n\n# \u6d4b\u8bd5\u5206\u7c7b\u5668\naccuracy = nltk.classify.accuracy(classifier, test_set)\nprint(f'Accuracy: {accuracy}')\n\n# \u663e\u793a\u5206\u7c7b\u5668\u7684\u6700\u91cd\u8981\u7279\u5f81\nclassifier.show_most_informative_features(10)<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\">\u7ed3\u8bba<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">NLTK \u662f\u4e00\u4e2a\u5f3a\u5927\u4e14\u7075\u6d3b\u7684\u81ea\u7136\u8bed\u8a00\u5904\u7406\u5de5\u5177\u5305\uff0c\u63d0\u4f9b\u4e86\u4e30\u5bcc\u7684\u5de5\u5177\u548c\u8d44\u6e90\uff0c\u6db5\u76d6\u4e86\u4ece\u6587\u672c\u9884\u5904\u7406\u5230\u9ad8\u7ea7\u8bed\u8a00\u5efa\u6a21\u548c\u5206\u6790\u7684\u5404\u4e2a\u65b9\u9762\u3002\u5b83\u9002\u7528\u4e8e\u5404\u79cd NLP \u4efb\u52a1\uff0c\u5305\u62ec\u6587\u672c\u5206\u7c7b\u3001\u4fe1\u606f\u63d0\u53d6\u3001\u8bed\u6cd5\u5206\u6790\u7b49\uff0c\u662f\u7814\u7a76\u548c\u5f00\u53d1 NLP 
\u5e94\u7528\u7684\u5229\u5668\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u975e\u539f\u521b\uff0cChatGPT \u539f\u521b\u751f\u6210 NLTK\uff08Natural Language Toolkit\uff09\u662f\u4e00\u4e2a\u7528\u4e8e\u81ea\u7136\u8bed&#8230; <\/p>\n<div class=\"read-more navbutton\"><a href=\"https:\/\/t.n-years.com\/?p=6205\">\u9605\u8bfb\u66f4\u591a<i class=\"fa fa-angle-double-right\"><\/i><\/a><\/div>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[5],"tags":[],"class_list":["post-6205","post","type-post","status-publish","format-standard","hentry","category-5"],"_links":{"self":[{"href":"https:\/\/t.n-years.com\/index.php?rest_route=\/wp\/v2\/posts\/6205","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/t.n-years.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/t.n-years.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/t.n-years.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/t.n-years.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=6205"}],"version-history":[{"count":1,"href":"https:\/\/t.n-years.com\/index.php?rest_route=\/wp\/v2\/posts\/6205\/revisions"}],"predecessor-version":[{"id":6206,"href":"https:\/\/t.n-years.com\/index.php?rest_route=\/wp\/v2\/posts\/6205\/revisions\/6206"}],"wp:attachment":[{"href":"https:\/\/t.n-years.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=6205"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/t.n-years.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=6205"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/t.n-years.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=6205"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}