summaryrefslogtreecommitdiff
path: root/src/tools/analysis/extract-user-domains.ts
blob: bc120f5c17f75b9ac97f1aea23fb2319dbf9c97f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import * as URL from 'url';

import Post from '../../api/models/post';
import User from '../../api/models/user';
import parse from '../../api/common/text';

// Dump the reason of any unhandled promise rejection to the console
process.on('unhandledRejection', console.dir);

/**
 * Extracts the hostname of every URL and link token found in a post's text.
 *
 * @param text Raw post text; `null` or `undefined` yields an empty list.
 * @returns Hostnames in order of appearance, one per URL/link token
 *          (duplicates are kept). Tokens whose URL has no hostname are skipped.
 */
function tokenize(text: string): string[] {
	if (text == null) return [];

	// Parse the text into tokens
	const ast = parse(text);

	return ast
		// Keep only the tokens that carry a URL
		.filter(t => t.type === 'url' || t.type === 'link')
		.map(t => URL.parse(t.url).hostname)
		// URL.parse yields no hostname for URLs without a host (e.g. relative
		// ones); drop those so callers never count a "null" domain
		.filter((hostname): hostname is string => typeof hostname === 'string' && hostname !== '');
}

// Load the ids of all users, then extract domains for them one at a time
User.find({}, {
	fields: {
		_id: true
	}
}).then(async users => {
	// Wait one second before retrying a user whose extraction failed
	const pause = () => new Promise(wake => setTimeout(wake, 1000));

	for (const user of users) {
		// Keep retrying the same user until the extraction succeeds
		for (;;) {
			try {
				await extractDomainsOne(user._id);
				break;
			} catch (err) {
				console.error(err);
				await pause();
			}
		}
	}

	console.log('complete');
});

/**
 * Works out which domains a user links to most often and stores them on the user.
 *
 * Loads up to 10000 of the user's most recent posts that have a `text` field,
 * counts the domains returned by `tokenize` for each post and writes the ten
 * most frequent ones to the user's `domains` field. Each entry carries a
 * `weight`: its count divided by the highest count. Users with fewer than 100
 * such posts are left untouched. Progress is reported on stdout.
 *
 * @param id `_id` of the user to process.
 * @returns A promise that resolves once the user has been updated or skipped,
 *          and rejects when fetching the posts or saving the result fails.
 */
async function extractDomainsOne(id): Promise<void> {
	process.stdout.write(`extracting domains of ${id} ...`);

	// Fetch recent posts. Being awaited directly inside an async function, a
	// failure here rejects the returned promise so the caller can react to it.
	const recentPosts = await Post.find({
		user_id: id,
		text: {
			$exists: true
		}
	}, {
		sort: {
			_id: -1
		},
		limit: 10000,
		fields: {
			_id: false,
			text: true
		}
	});

	// Abort if the user has too few posts
	if (recentPosts.length < 100) {
		process.stdout.write(' >>> -\n');
		return;
	}

	// Count how often each domain appears. A Map is used instead of a plain
	// object so that hostnames such as "constructor" cannot collide with
	// members inherited from Object.prototype.
	const counts = new Map<string, number>();

	// Highest count seen so far; weights are expressed relative to it
	let peak = 0;

	for (const post of recentPosts) {
		for (const domain of tokenize(post.text)) {
			// Skip URLs for which no hostname could be determined
			if (!domain) continue;

			const count = (counts.get(domain) || 0) + 1;
			counts.set(domain, count);
			if (count > peak) peak = count;
		}
	}

	// Sort domains by frequency (most frequent first) and keep the top 10
	const topDomains = Array.from(counts.entries())
		.sort((a, b) => b[1] - a[1])
		.slice(0, 10);

	process.stdout.write(' >>> ' + topDomains.map(([domain]) => domain).join(', ') + '\n');

	// Make domains object (includes weights)
	const domainsObj = topDomains.map(([domain, count]) => ({
		domain: domain,
		weight: count / peak
	}));

	// Save
	await User.update({ _id: id }, {
		$set: {
			domains: domainsObj
		}
	});
}