# Organize our users' schoolNames.
#
# Interactive command-line script. It loads users created after `startDate`,
# groups them by shared attributes (course instances, IP, school name, email
# domain, clans, referrer), and repeatedly asks the operator to pick or type a
# school name, which is then written to the selected users.

database = require '../server/commons/database'
mongoose = require 'mongoose'
log = require 'winston'
async = require 'async'
moment = require 'moment'

### SET UP ###
do (setupLodash = this) ->
  GLOBAL._ = require 'lodash'
  _.str = require 'underscore.string'
  _.mixin _.str.exports()
  GLOBAL.tv4 = require('tv4').tv4
  database.connect()

UserHandler = require '../server/users/user_handler'
User = require '../server/users/User'

# JavaScript months are zero-based, so this is December 1, 2015.
startDate = new Date 2015, 11, 1
debugging = false

query = dateCreated: {$gt: startDate}, emailLower: {$exists: true}
selection = 'name emailLower schoolName courseInstances clans ageRange dateCreated referrer points lastIP hourOfCode preferredLanguage lastLevel'
User.find(query).select(selection).lean().exec (err, users) ->
  if err
    # Without users there is nothing to classify, so stop instead of crashing on `users.length`.
    console.error "Ran into error loading users:", err
    return process.exit 1
  usersWithSchools = _.filter users, 'schoolName'
  log.info "Found #{usersWithSchools.length} users of #{users.length} users registered after #{startDate}."
  nextPrompt users

# Asks the operator what the school of the next user should be.
#
# users        - pool of users still available for classification and suggestions.
# question     - optional prompt text; defaults to the formatted suggestion list.
# userToSchool - optional user to keep asking about (passed when re-prompting).
# suggestions  - the suggestions that belong to `userToSchool`.
#
# When `userToSchool` is not given, the pool is re-categorized and the next user
# is picked by findUserToSchool; the process exits if none can be found.
nextPrompt = (users, question, userToSchool, suggestions) ->
  unless userToSchool
    sortUsers users
    [userToSchool, suggestions] = findUserToSchool users
    # findUserToSchool always returns an array, so test the user itself.
    unless userToSchool
      console.log 'Done.'
      return process.exit()
  question ?= formatSuggestions userToSchool, suggestions
  # Re-ask about the same user without re-running the search.
  retry = -> nextPrompt users, "> ", userToSchool, suggestions
  prompt question, (answer) ->
    answer = answer.trim()
    if answer in ['q', 'quit']
      console.log 'Bye.'
      return process.exit()
    if answer is ''
      # Skip this user.
      return nextPrompt _.without users, userToSchool
    if /^\d+$/.test answer
      # Only a bare number is a menu choice; a typed name may legitimately start with digits.
      num = parseInt answer, 10
      schoolName = if num then suggestions[num - 1]?.schoolName else userToSchool.schoolName
      unless schoolName
        console.log "#{answer}? There's no school name for that choice. What should it really be?"
        return retry()
      return finalizePrompt userToSchool, suggestions, schoolName, users
    if answer.length < 10
      console.log "#{answer}? That's kind of short--I don't think school names and locations can be this short. What should it really be?"
      return retry()
    unless /,.+,/.test answer
      console.log "#{answer}? We need the full location (with two commas), like Example High School, Springfield, IL. What should it really be?"
      return retry()
    finalizePrompt userToSchool, suggestions, answer, users

# Asks which users the chosen school name should be applied to, saves it, and
# moves on to the next prompt with the updated users removed from the pool.
#
# Number 0 refers to `userToSchool`; number N (N >= 1) refers to suggestions[N - 1].
finalizePrompt = (userToSchool, suggestions, schoolName, users) ->
  console.log "Selected schoolName: \"#{schoolName}\""
  question = "Also apply this to other users? Ex.: 'all', '0 1 2 5 9-14', 'all but 38 59-65', '0' or blank to just do this one, or q to quit.\n> "
  prompt question, (answer) ->
    answer = answer.trim()
    if answer in ['q', 'quit']
      console.log 'Bye.'
      return process.exit()
    everyone = [userToSchool].concat (s.user for s in suggestions)
    userForNumber = (number) -> if number then suggestions[number - 1]?.user else userToSchool
    if answer is ''
      console.log "Should just do", userToSchool._id, userToSchool.emailLower, userToSchool.schoolName
      targets = [userToSchool]
    else if answer is 'all'
      targets = everyone
      console.log "Doing all #{targets.length} users..."
    else if /^all/.test answer
      numbers = findNumbers answer, suggestions.length
      skipped = _.filter (userForNumber(number) for number in numbers)
      targets = _.without everyone, skipped...
      console.log "Doing all #{targets.length} users without #{numbers}..."
    else
      numbers = findNumbers answer, suggestions.length
      targets = _.filter (userForNumber(number) for number in numbers)
      console.log "Doing #{targets.length} users for #{numbers}..."
    unless targets.length
      # Nothing usable was selected; ask again rather than running an empty update and moving on.
      console.log "No users selected, so nothing to update."
      return finalizePrompt userToSchool, suggestions, schoolName, users
    User.update {_id: {$in: (_.map targets, '_id')}}, {schoolName: schoolName}, {multi: true}, (err, result) ->
      if err
        console.error "Ran into error doing the save:", err
        return finalizePrompt userToSchool, suggestions, schoolName, users
      console.log "Updated users' schoolNames. Result:", result
      # Take these users out of the pool to make suggestions about before going on to next suggestions.
      remainingUsers = _.without users, targets...
      nextPrompt remainingUsers

# Parses a selection such as '0 1 2 5 9-14' or 'all but 38 59-65' into a list of
# unique numbers. Tokens that are neither a number nor a number range are ignored.
# Numbers above `max` are reported and left out of the result.
findNumbers = (answer, max) ->
  numbers = []
  for token in answer.split(/\s+/)
    if /^\d+$/.test token
      numbers.push parseInt(token, 10)
    else if bounds = token.match /^(\d+)-(\d+)$/
      low = parseInt bounds[1], 10
      high = parseInt bounds[2], 10
      numbers.push number for number in [low .. high]
  valid = []
  for number in _.uniq numbers
    if number > max
      console.log "Incorrect number #{number} higher than max: #{max}"
    else
      valid.push number
  valid

# Builds a one-line, space-separated description of a user. The creation date is
# shown relative to `relativeToUser`'s creation date when given, otherwise relative to now.
formatUser = (user, relativeToUser) ->
  # TODO: replace date string with relative time since signup compared to target user, and actually make suggestions based on students that signed up at almost the same time
  props = _.pick(user, ['name', 'emailLower', 'ageRange', 'dateCreated', 'lastLevel', 'points', 'referrer', 'hourOfCode'])
  props.dateCreated = if relativeToUser then moment(props.dateCreated).from(relativeToUser.dateCreated) else moment(props.dateCreated).fromNow()
  _.values(props).join(' ')

# Builds the main prompt: entry 0 is the user's current school name, followed by
# the numbered suggestions with their match reasons and originating user.
formatSuggestions = (userToSchool, suggestions) ->
  suggestionPrompts = ("#{_.str.rpad(i + 1, 3)} #{_.str.rpad(s.schoolName, 50)} #{s.reasons.length} #{if s.reasons.length > 1 then 'Matches' else 'Match'}: #{s.reasons.join(', ')}\tfrom user: #{formatUser(s.user, userToSchool)}" for s, i in suggestions).join('\n')
  [
    ''
    'What should the school for this user be?'
    "#{_.str.rpad(0, 3)} #{_.str.rpad(userToSchool.schoolName, 50)} #{formatUser(userToSchool)}"
    'Suggestions:'
    suggestionPrompts
    'Choose a number, type a name, enter to skip, or q to quit.'
    '> '
  ].join '\n'

# Group names that have already produced a prompt; they are not sampled again.
checkedTopGroups = {}

# Weight of each match reason when ranking candidate users ('Domain' is handled separately).
reasonWeights =
  'Course instances': 50
  'IP': 40
  'Name': 30
  'Referrer': 20
  'Clans': 0.01

# Scores how well-founded the suggestions for `target` are.
scoreSuggestions = (target, suggestions) ->
  domainWeight = if getDomain(target) is 'cps.edu' then 1 else 10
  total = 0
  for suggestion in suggestions
    for reason in suggestion.reasons
      total += if reason is 'Domain' then domainWeight else (reasonWeights[reason] ? 0)
  # NOTE(review): the ranking value has always been the weight total multiplied by the
  # number of suggestions (the previous reducer re-scanned the whole list for every
  # element). That value is kept so the ranking does not change, but it is now computed
  # in a single pass. Confirm whether the plain total was intended.
  total * suggestions.length

# Picks the user we can make the most reasoned suggestions about.
#
# For every field, the first group that is neither checked nor already done is
# sampled, and the sample with the highest score across all fields wins.
# Returns [user, suggestions]; `user` is null when nothing could be found.
findUserToSchool = (users) ->
  # We find the top user from the top group that we can make the most reasoned suggestions about what the school name would be.
  [bestTarget, bestTargetSuggestions, mostReasons, bestGroup] = [null, [], 0, null]
  for field, groups of topGroups
    for nextLargestGroup in groups when not checkedTopGroups[nextLargestGroup]
      possibleTargets = userCategories[field][nextLargestGroup]
      schoolNames = (t.schoolName for t in _.uniq possibleTargets, 'schoolName')
      # TODO: better method to avoid showing users where everyone in the suggestion already has the same school (because we have already done this group)
      alreadyDone = false
      for schoolName in schoolNames when schoolName?.length > 10 and /,.+,/.test schoolName
        # Long enough school name with location info (two commas)
        sharedCount = _.filter(possibleTargets, schoolName: schoolName).length
        if sharedCount > 0.5 * possibleTargets.length
          console.log 'Already done', schoolName, sharedCount, possibleTargets.length, 'for', field, nextLargestGroup
          alreadyDone = true
      continue if alreadyDone
      nSamples = Math.min 15, Math.max(4, Math.floor(possibleTargets.length / 20))
      if debugging then console.log 'Checking', nSamples, 'samples of', possibleTargets.length, 'players in the biggest', field, 'group:', nextLargestGroup
      for i in [0 ... nSamples]
        # Spread the samples evenly over the group.
        target = possibleTargets[Math.floor(i * possibleTargets.length / (nSamples + 1))]
        suggestions = findSuggestions target
        reasons = scoreSuggestions target, suggestions
        if reasons > mostReasons
          bestTarget = target
          bestTargetSuggestions = suggestions
          mostReasons = reasons
          bestGroup = nextLargestGroup
      # Only the largest eligible group of each field is sampled per call.
      break
  checkedTopGroups[bestGroup] = true if bestGroup?
  return [bestTarget, bestTargetSuggestions]

# Adds `reason` to the existing suggestion for `otherUser`, or appends a new suggestion.
addSuggestion = (suggestions, otherUser, reason, schoolName) ->
  if existingSuggestion = _.find(suggestions, user: otherUser)
    existingSuggestion.reasons.push reason
  else
    suggestions.push schoolName: schoolName, reasons: [reason], user: otherUser
  return

# Collects other users that share something with `target`, each with the list of
# reasons it matched. Sorted by number of reasons (descending), then school name.
findSuggestions = (target) ->
  # Look for other users with the same IP, course instances, clans, or similar school names or non-common shared email domains.
  suggestions = []
  t0 = new Date()
  if debugging then console.log '  Checking suggestions for', target.emailLower, target.schoolName, (new Date()) - t0
  if target.lastIP
    # First pass, so no suggestion can exist for these users yet.
    for otherUser in (userCategories.lastIP[target.lastIP] ? []) when otherUser isnt target
      suggestions.push schoolName: otherUser.schoolName, reasons: ['IP'], user: otherUser
  for leagueType in ['courseInstances', 'clans']
    if debugging then console.log '  Now checking', leagueType, (new Date()) - t0
    continue unless target[leagueType]?.length
    reason = _.str.humanize(leagueType)
    for league in target[leagueType]
      for otherUser in (userCategories[leagueType][league] ? []) when otherUser isnt target
        addSuggestion suggestions, otherUser, reason, otherUser.schoolName
  if target.schoolName?.length > 5
    if debugging then console.log '  Now checking schoolName', (new Date()) - t0
    nameMatches = []
    for otherSchoolName in topGroups.schoolName
      score = stringScore otherSchoolName, target.schoolName, 0.8
      continue if score < 0.25
      nameMatches.push schoolName: otherSchoolName, score: score
    nameMatches = (match.schoolName for match in (_.sortBy nameMatches, (match) -> -match.score))
    # Only the ten closest school names contribute suggestions.
    for match in nameMatches.slice(0, 10)
      for otherUser in (userCategories.schoolName[match] ? []) when otherUser isnt target
        addSuggestion suggestions, otherUser, "Name", match
  if debugging then console.log '  Now checking domain', (new Date()) - t0
  if emailDomain = getDomain target
    for otherUser in (userCategories.domain[emailDomain] ? []) when otherUser isnt target
      addSuggestion suggestions, otherUser, "Domain", otherUser.schoolName
  if debugging then console.log '  Now checking referrer', (new Date()) - t0
  if referrer = getReferrer target
    for otherUser in (userCategories.referrer[referrer] ? []) when otherUser isnt target
      addSuggestion suggestions, otherUser, "Referrer", otherUser.schoolName
  if debugging then console.log '  Done checking referrer', (new Date()) - t0
  suggestions = _.sortBy suggestions, 'schoolName'
  suggestions = _.sortBy suggestions, (s) -> -s.reasons.length
  return suggestions

# field -> category value -> users having that value.
userCategories = {}
# field -> category values ordered by group size (largest first), limited to useful sizes.
topGroups = {}
usersCategorized = {}

# Rebuilds `userCategories` and `topGroups` from the given users.
sortUsers = (users) ->
  users = _.sortBy users, (u) -> -u.points
  users = _.sortBy users, ['schoolName', 'lastIP']
  for field in ['courseInstances', 'lastIP', 'schoolName', 'domain', 'clans', 'referrer']
    userCategories[field] = categorizeUsers users, field
    bySize = _.sortBy _.keys(userCategories[field]), (key) -> -userCategories[field][key].length
    # Keep groups of more than 2 users that are below the size cap (30 for clans, 5000 otherwise).
    maxSize = if field is 'clans' then 30 else 5000
    topGroups[field] = (group for group in bySize when 2 < userCategories[field][group].length < maxSize)
  return

# Groups users by the value(s) of `field`. 'domain' and 'referrer' are derived
# through getDomain/getReferrer; array-valued fields put the user in one group per
# element. String values are trimmed, and empty values are skipped.
categorizeUsers = (users, field) ->
  categories = {}
  for user in users
    value = switch field
      when 'domain' then getDomain user
      when 'referrer' then getReferrer user
      else user[field]
    continue unless value
    values = if _.isArray(value) then value else [value]
    for value in values when value
      if value.trim
        value = value.trim()
        continue unless value
      categories[value] ?= []
      categories[value].push user
  categories

# Reserved for caching typo checks of email domains; nothing reads or writes it at the moment.
typoCache = {}

# Returns the user's email domain, or null when there is none or when it is one of
# the common email provider domains.
getDomain = (user) ->
  # emailLower can be null even though the query requires the field to exist.
  emailDomain = user.emailLower?.split('@')[1]
  return null unless emailDomain
  return null if commonEmailDomainMap[emailDomain]
  # TODO: also reject near-misses of common domains (e.g. stringScore against
  # commonEmailDomains above 0.9, cached in typoCache). Is that actually too slow?
  emailDomain

commonReferrersRegex = /(google|bing\.|yahoo|duckduckgo|jobs\.lever|code\.org|twitter|facebook|dollarclick|stumbleupon|vk\.com|playpcesor|reddit|lifehacker|favorite|bnext|freelance|taringa|blogthinkbig|graphism|inside\.com|korben|habrahabr|iplaysoft|geekbrains|playground|ycombinator|github)/

# Returns a normalized referrer for grouping: the class code when the referrer ends
# in a `?_cc=` parameter, otherwise the lower-cased URL without its scheme. Returns
# null for missing, common, and other codecombat referrers.
getReferrer = (user) ->
  referrer = user.referrer?.toLowerCase().trim()
  return null unless referrer
  referrer = referrer.replace /^https?:\/\//, ''
  return null if commonReferrersRegex.test referrer
  classCode = referrer.match(/\?_cc=(\S+)$/)?[1]
  return classCode if classCode
  return null if /codecombat/.test referrer
  referrer

# Scores how well `word` matches the string `_a`; 1 means identical.
# Port of https://github.com/joshaven/string_score
#
# fuzziness - optional value between 0 and 1. When given, characters of `word`
#             missing from `_a` lower the score instead of making it 0.
stringScore = (_a, word, fuzziness) ->
  return 1 if word is _a
  # An empty string cannot match anything (and would divide by zero below).
  return 0 if word is "" or _a is ""
  runningScore = 0
  string = _a
  lString = string.toLowerCase()
  lWord = word.toLowerCase()
  startAt = 0
  fuzzies = 1
  fuzzyFactor = 1 - fuzziness if fuzziness
  for i in [0...word.length]
    idxOf = lString.indexOf lWord[i], startAt
    if idxOf is -1
      # Without fuzziness a missing character means no match at all.
      return 0 unless fuzziness
      fuzzies += fuzzyFactor
      continue
    if startAt is idxOf
      # Consecutive character.
      charScore = 0.7
    else
      charScore = 0.1
      # Bonus for matching at the start of a word.
      charScore += 0.8 if string[idxOf - 1] is ' '
    # Bonus for matching case.
    charScore += 0.1 if string[idxOf] is word[i]
    runningScore += charScore
    startAt = idxOf + 1
  finalScore = 0.5 * (runningScore / string.length + runningScore / word.length) / fuzzies
  finalScore += 0.15 if lWord[0] is lString[0] and finalScore < 0.85
  finalScore

# Writes `question` to stdout and passes the next chunk read from stdin, trimmed, to `callback`.
prompt = (question, callback) ->
  process.stdin.resume()
  process.stdout.write question
  process.stdin.once 'data', (data) ->
    callback data.toString().trim()

# https://github.com/mailcheck/mailcheck/wiki/List-of-Popular-Domains
commonEmailDomains = [
  # Default domains included
  "aol.com", "att.net", "comcast.net", "facebook.com", "gmail.com", "gmx.com", "googlemail.com",
  "google.com", "hotmail.com", "hotmail.co.uk", "mac.com", "me.com", "mail.com", "msn.com",
  "live.com", "sbcglobal.net", "verizon.net", "yahoo.com", "yahoo.co.uk",

  # Other global domains
  "email.com", "games.com", "gmx.net", "hush.com", "hushmail.com", "icloud.com", "inbox.com",
  "lavabit.com", "love.com", "outlook.com", "pobox.com", "rocketmail.com",
  "safe-mail.net", "wow.com", "ygm.com", "ymail.com", "zoho.com", "fastmail.fm",

  # United States ISP domains
  "bellsouth.net", "charter.net", "comcast.net", "cox.net", "earthlink.net", "juno.com",

  # British ISP domains
  "btinternet.com", "virginmedia.com", "blueyonder.co.uk", "freeserve.co.uk", "live.co.uk",
  "ntlworld.com", "o2.co.uk", "orange.net", "sky.com", "talktalk.co.uk", "tiscali.co.uk",
  "virgin.net", "wanadoo.co.uk", "bt.com",

  # Domains used in Asia
  "sina.com", "qq.com", "naver.com", "hanmail.net", "daum.net", "nate.com", "yahoo.co.jp", "yahoo.co.kr", "yahoo.co.id", "yahoo.co.in", "yahoo.com.sg", "yahoo.com.ph", "yahoo.com.tw",

  # French ISP domains
  "hotmail.fr", "live.fr", "laposte.net", "yahoo.fr", "wanadoo.fr", "orange.fr", "gmx.fr", "sfr.fr", "neuf.fr", "free.fr",

  # German ISP domains
  "gmx.de", "hotmail.de", "live.de", "online.de", "t-online.de", "web.de", "yahoo.de",

  # Russian ISP domains
  "mail.ru", "rambler.ru", "yandex.ru", "ya.ru", "list.ru",

  # Belgian ISP domains
  "hotmail.be", "live.be", "skynet.be", "voo.be", "tvcablenet.be", "telenet.be",

  # Argentinian ISP domains
  "hotmail.com.ar", "live.com.ar", "yahoo.com.ar", "fibertel.com.ar", "speedy.com.ar", "arnet.com.ar",

  # Domains used in Mexico
  "hotmail.com", "gmail.com", "yahoo.com.mx", "live.com.mx", "yahoo.com", "hotmail.es", "live.com", "hotmail.com.mx", "prodigy.net.mx", "msn.com"
]
# Lookup table (domain -> true) built from commonEmailDomains, so a domain can be
# checked with a single property access instead of scanning the list.
commonEmailDomainMap = {}
for domain in commonEmailDomains
  commonEmailDomainMap[domain] = true