summaryrefslogtreecommitdiff
path: root/src/tools/analysis
diff options
context:
space:
mode:
author: syuilo <syuilotan@yahoo.co.jp> 2017-09-08 22:10:25 +0900
committer: syuilo <syuilotan@yahoo.co.jp> 2017-09-08 22:10:25 +0900
commit: 5c37b9cef307bf795c4bc49dc7ab21de0c8ba97b (patch)
tree: 2b322eaebbb0ff5e3eafdf572014f49cd8cfa7a8 /src/tools/analysis
parent: Merge branch 'master' of https://github.com/syuilo/misskey (diff)
download: sharkey-5c37b9cef307bf795c4bc49dc7ab21de0c8ba97b.tar.gz
sharkey-5c37b9cef307bf795c4bc49dc7ab21de0c8ba97b.tar.bz2
sharkey-5c37b9cef307bf795c4bc49dc7ab21de0c8ba97b.zip
Implement #771
Diffstat (limited to 'src/tools/analysis')
-rw-r--r-- src/tools/analysis/extract-user-domains.ts | 120
1 files changed, 120 insertions, 0 deletions
diff --git a/src/tools/analysis/extract-user-domains.ts b/src/tools/analysis/extract-user-domains.ts
new file mode 100644
index 0000000000..bc120f5c17
--- /dev/null
+++ b/src/tools/analysis/extract-user-domains.ts
@@ -0,0 +1,120 @@
+import * as URL from 'url';
+
+import Post from '../../api/models/post';
+import User from '../../api/models/user';
+import parse from '../../api/common/text';
+
+process.on('unhandledRejection', console.dir); // Dump the reason of any promise rejection that no handler caught
+
+/**
+ * Extract the host name of every URL that appears in a post's text.
+ *
+ * The text is run through the project's text parser and every token of
+ * type `url` or `link` is reduced to the host name of its `url` field.
+ *
+ * @param text Raw post text; `null`/`undefined` yields an empty list.
+ * @returns Host names in order of appearance. Duplicates are kept so that
+ *          callers can count how often each domain occurs.
+ */
+function tokenize(text: string): string[] {
+  if (text == null) return [];
+
+  // Parse the text into tokens
+  const ast = parse(text);
+
+  return ast
+    // Keep only the tokens that carry a URL
+    .filter(t => t.type === 'url' || t.type === 'link')
+    .map(t => URL.parse(t.url).hostname)
+    // `hostname` is null (or empty) when the URL has no host part, e.g. a
+    // relative or malformed link; those must not be reported as a domain.
+    .filter((hostname): hostname is string => !!hostname);
+}
+
+// Walk through every user, strictly one at a time, and extract their domains
+void (async () => {
+  // Only the ids are needed
+  const users = await User.find({}, {
+    fields: {
+      _id: true
+    }
+  });
+
+  const pause = (ms: number) => new Promise<void>(done => setTimeout(done, ms));
+
+  for (let idx = 0; idx < users.length; idx++) {
+    const userId = users[idx]._id;
+
+    // Retry the same user, one second after each failure, until it succeeds
+    for (;;) {
+      try {
+        await extractDomainsOne(userId);
+        break;
+      } catch (err) {
+        console.error(err);
+        await pause(1000);
+      }
+    }
+  }
+
+  console.log('complete');
+})();
+
+/**
+ * Compute the domains one user links to most often and store them on the
+ * user document.
+ *
+ * Up to 10,000 of the user's most recent posts that have a `text` field are
+ * scanned. Users with fewer than 100 such posts are skipped without being
+ * updated. Otherwise the ten most frequent domains are saved to the user's
+ * `domains` field as `{ domain, weight }` entries, where `weight` is the
+ * domain's count divided by the count of the most frequent domain.
+ *
+ * @param id `_id` of the user to analyse.
+ * @returns A promise that resolves (with no value) once the user has been
+ *          skipped or updated, and rejects when a database call fails.
+ */
+async function extractDomainsOne(id): Promise<void> {
+  process.stdout.write(`extracting domains of ${id} ...`);
+
+  // Fetch recent posts (newest first, text only). Awaiting directly inside
+  // an async function lets a failed query reject the returned promise; with
+  // an async Promise executor the failure would be lost and the caller
+  // would wait forever.
+  const recentPosts = await Post.find({
+    user_id: id,
+    text: {
+      $exists: true
+    }
+  }, {
+    sort: {
+      _id: -1
+    },
+    limit: 10000,
+    fields: {
+      _id: false,
+      text: true
+    }
+  });
+
+  // Give up when there are too few posts
+  if (recentPosts.length < 100) {
+    process.stdout.write(' >>> -\n');
+    return;
+  }
+
+  // Count occurrences per domain. A Map is used rather than a plain object
+  // so that host names matching Object.prototype members (such as
+  // "constructor") are counted like any other.
+  const counts = new Map<string, number>();
+  for (const post of recentPosts) {
+    for (const domain of tokenize(post.text)) {
+      // Links without a host part carry no domain
+      if (!domain) continue;
+      counts.set(domain, (counts.get(domain) || 0) + 1);
+    }
+  }
+
+  // Sort domains by frequency (most frequent first) and keep the top 10
+  const topDomains = Array.from(counts.entries())
+    .sort((a, b) => b[1] - a[1])
+    .slice(0, 10);
+
+  process.stdout.write(' >>> ' + topDomains.map(entry => entry[0]).join(', ') + '\n');
+
+  // The list is sorted, so the first entry holds the peak count. When the
+  // list is empty no weight is computed, so the 0 is never used as divisor.
+  const peak = topDomains.length > 0 ? topDomains[0][1] : 0;
+
+  // Make domains object (includes weights)
+  const domainsObj = topDomains.map(([domain, count]) => ({
+    domain: domain,
+    weight: count / peak
+  }));
+
+  // Save
+  await User.update({ _id: id }, {
+    $set: {
+      domains: domainsObj
+    }
+  });
+}