summaryrefslogtreecommitdiff
path: root/src/tools
diff options
context:
space:
mode:
author syuilo <syuilotan@yahoo.co.jp> 2017-09-07 13:19:14 +0900
committer syuilo <syuilotan@yahoo.co.jp> 2017-09-07 13:19:14 +0900
commit c5bb7dabf92ba92015b24bb3f03c4788d6707222 (patch)
tree 205f29ec126892d756c9260464bc0e70a5416619 /src/tools
parent Clean up (diff)
download sharkey-c5bb7dabf92ba92015b24bb3f03c4788d6707222.tar.gz
sharkey-c5bb7dabf92ba92015b24bb3f03c4788d6707222.tar.bz2
sharkey-c5bb7dabf92ba92015b24bb3f03c4788d6707222.zip
Add analysis script
Diffstat (limited to 'src/tools')
-rw-r--r-- src/tools/ai/extract-user-keywords.ts 94
1 files changed, 94 insertions, 0 deletions
diff --git a/src/tools/ai/extract-user-keywords.ts b/src/tools/ai/extract-user-keywords.ts
new file mode 100644
index 0000000000..9f21ae2e17
--- /dev/null
+++ b/src/tools/ai/extract-user-keywords.ts
@@ -0,0 +1,94 @@
+const MeCab = require('mecab-async');
+
+import Post from '../../api/models/post';
+import User from '../../api/models/user';
+import config from '../../conf';
+
+const mecab = new MeCab(); // one tokenizer instance for the whole script
+if (config.categorizer.mecab_command) { mecab.command = config.categorizer.mecab_command; } // a configured command replaces the default
+
+/** Returns the keyword tokens of `text`: MeCab nouns (名詞) tagged as proper (固有名詞) or general (一般). */
+function tokenize(text: string) {
+  // Use the module-level `mecab`: in a plain module-level call `this` is not an object holding it
+  return mecab.parseSync(text)
+    // Keywords only: token[1] must be 名詞 and token[2] either 固有名詞 or 一般
+    .filter(token => token[1] === '名詞' && (token[2] === '固有名詞' || token[2] === '一般'))
+    // Take out token[0] of each remaining token
+    .map(token => token[0]);
+}
+
+// Fetch the _id of every user, then extract keywords for one user at a time
+User.find({}, {
+  fields: { _id: true }
+}).then(async users => {
+  for (const user of users) {
+    // Bridge the callback API to a promise so the loop stays sequential; a rejected
+    // return value is forwarded so a failure is reported instead of stalling the run
+    await new Promise((resolve, reject) => {
+      Promise.resolve(extractKeywordsOne(user._id, resolve)).catch(reject);
+    });
+  }
+  console.log('complete');
+}).catch(err => {
+  // A failed user query or extraction ends up here rather than as an unhandled rejection
+  console.error(err);
+  process.exitCode = 1;
+});
+
+/**
+ * Extracts the ten most frequent keywords from a user's recent posts and stores
+ * them in the user's `keywords` field. Users with fewer than 10 posts are skipped.
+ *
+ * @param id - _id of the user to process
+ * @param cb - invoked once, without arguments, when this user is finished (saved,
+ *   skipped, or failed), so a caller chaining users can always continue
+ * @returns a promise that settles after cb has been invoked
+ */
+async function extractKeywordsOne(id, cb) {
+  console.log(`extract keywords of ${id} ...`);
+
+  try {
+    // Fetch recent posts: text only, sorted by descending _id, at most 1000
+    const recentPosts = await Post.find({
+      user_id: id,
+      text: { $exists: true }
+    }, {
+      sort: { _id: -1 },
+      limit: 1000,
+      fields: { _id: false, text: true }
+    });
+
+    // Stop here when the user has too few posts
+    if (recentPosts.length < 10) return;
+
+    // Prototype-less dictionary: with a plain {} a word such as "constructor" would
+    // hit an inherited property and turn its count into NaN
+    const counts: { [keyword: string]: number } = Object.create(null);
+
+    // Extract keywords from the recent posts and count their occurrences
+    for (const post of recentPosts) {
+      for (const keyword of tokenize(post.text)) {
+        counts[keyword] = (counts[keyword] || 0) + 1;
+      }
+    }
+
+    // Sort keywords by frequency (descending) and keep the top 10
+    const topKeywords = Object.keys(counts)
+      .sort((a, b) => counts[b] - counts[a])
+      .slice(0, 10);
+
+    // End the line so the next progress message starts on a line of its own
+    process.stdout.write(' >>> ' + topKeywords.join(' ') + '\n');
+
+    // Save
+    await User.update({ _id: id }, {
+      $set: { keywords: topKeywords }
+    });
+  } catch (err) {
+    // Report a failed query, tokenizer call or update; without this the returned
+    // promise would reject unobserved and cb would never run
+    console.error(`failed to extract keywords of ${id}:`, err);
+  } finally {
+    cb();
+  }
+}