/*
 * src/tools/analysis/extract-user-domains.ts
 *
 * Recovered from the patch for commit 5c37b9cef307bf795c4bc49dc7ab21de0c8ba97b
 * (syuilo, Fri, 8 Sep 2017 22:10:25 +0900, "Implement #771").
 *
 * Analysis tool: for every user, look at the hostnames linked from their
 * recent posts and store the most frequent ones (with a relative weight)
 * in the user's `domains` field.
 */

import * as URL from 'url';

import Post from '../../api/models/post';
import User from '../../api/models/user';
import parse from '../../api/common/text';

process.on('unhandledRejection', console.dir);

/** Users with fewer recent posts than this are skipped. */
const MIN_POSTS = 100;

/** Maximum number of recent posts inspected per user. */
const MAX_RECENT_POSTS = 10000;

/** Number of most frequent domains stored per user. */
const TOP_DOMAIN_COUNT = 10;

/** Delay before retrying a user whose extraction failed. */
const RETRY_DELAY_MS = 1000;

/**
 * Extracts the hostnames of every URL / link token found in a post text.
 *
 * @param text Raw post text; `null` / `undefined` yields an empty list.
 * @returns One hostname per URL or link token, in order of appearance.
 *          Tokens whose URL has no hostname are dropped so that they are
 *          not counted under a bogus "null" key by the caller.
 */
function tokenize(text: string): string[] {
  if (text == null) return [];

  // Parse
  const ast = parse(text);

  return ast
    // Extract URLs
    .filter(t => (t.type === 'url' || t.type === 'link') && typeof t.url === 'string')
    .map(t => URL.parse(t.url).hostname)
    // url.parse() reports a missing host as null; skip those entries
    .filter((hostname): hostname is string => !!hostname);
}

/**
 * Computes the top linked domains of one user and saves them on the user.
 *
 * Declared `async` (instead of wrapping an async executor in `new Promise`)
 * so that a failing query or parse rejects the returned promise and the
 * caller can react to it.
 *
 * @param id `_id` of the user to analyse.
 * @returns A promise that resolves once the user has been skipped (too few
 *          posts) or updated, and rejects if fetching, parsing or saving fails.
 */
async function extractDomainsOne(id): Promise<void> {
  process.stdout.write(`extracting domains of ${id} ...`);

  // Fetch recent posts
  const recentPosts = await Post.find({
    user_id: id,
    text: {
      $exists: true
    }
  }, {
    sort: {
      _id: -1
    },
    limit: MAX_RECENT_POSTS,
    fields: {
      _id: false,
      text: true
    }
  });

  // Abort if there are too few posts
  if (recentPosts.length < MIN_POSTS) {
    process.stdout.write(' >>> -\n');
    return;
  }

  // Prototype-less dictionary: hostnames such as "constructor" must not
  // collide with properties inherited from Object.prototype.
  const counts: { [domain: string]: number } = Object.create(null);

  // Extract domains from recent posts
  for (const post of recentPosts) {
    for (const domain of tokenize(post.text)) {
      counts[domain] = (counts[domain] || 0) + 1;
    }
  }

  // Sort domains by frequency (most frequent first)
  const domainsSorted = Object.keys(counts).sort((a, b) => counts[b] - counts[a]);

  // Lookup top domains
  const topDomains = domainsSorted.slice(0, TOP_DOMAIN_COUNT);

  process.stdout.write(' >>> ' + topDomains.join(', ') + '\n');

  // The list is sorted descending, so the first entry holds the peak count.
  // When there are no domains the map below never divides by it.
  const peak = topDomains.length > 0 ? counts[topDomains[0]] : 0;

  // Make domains object (includes weights relative to the peak)
  const domainsObj = topDomains.map(domain => ({
    domain: domain,
    weight: counts[domain] / peak
  }));

  // Save
  await User.update({ _id: id }, {
    $set: {
      domains: domainsObj
    }
  });
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>(resolve => setTimeout(resolve, ms));
}

/**
 * Processes every user sequentially. A user whose extraction fails is
 * logged and retried after RETRY_DELAY_MS, without a retry limit (this
 * matches the original tool's behavior).
 */
async function main(): Promise<void> {
  // Fetch all users
  const users = await User.find({}, {
    fields: {
      _id: true
    }
  });

  for (const user of users) {
    for (;;) {
      try {
        await extractDomainsOne(user._id);
        break;
      } catch (err) {
        console.error(err);
        await sleep(RETRY_DELAY_MS);
      }
    }
  }

  console.log('complete');
}

main().catch(console.dir);