buildSchoolGraph script working well

This commit is contained in:
Nick Winter 2015-12-09 11:55:03 -08:00
parent 7076a23fe2
commit 0539f796a7

View file

@@ -4,6 +4,7 @@ database = require '../server/commons/database'
mongoose = require 'mongoose' mongoose = require 'mongoose'
log = require 'winston' log = require 'winston'
async = require 'async' async = require 'async'
moment = require 'moment'
### SET UP ### ### SET UP ###
do (setupLodash = this) -> do (setupLodash = this) ->
@@ -19,6 +20,8 @@ User = require '../server/users/User'
startDate = new Date 2015, 11, 1 startDate = new Date 2015, 11, 1
debugging = false
query = dateCreated: {$gt: startDate}, emailLower: {$exists: true} query = dateCreated: {$gt: startDate}, emailLower: {$exists: true}
selection = 'name emailLower schoolName courseInstances clans ageRange dateCreated referrer points lastIP hourOfCode preferredLanguage lastLevel' selection = 'name emailLower schoolName courseInstances clans ageRange dateCreated referrer points lastIP hourOfCode preferredLanguage lastLevel'
User.find(query).select(selection).lean().exec (err, users) -> User.find(query).select(selection).lean().exec (err, users) ->
@@ -26,9 +29,10 @@ User.find(query).select(selection).lean().exec (err, users) ->
log.info "Found #{usersWithSchools.length} users of #{users.length} users registered after #{startDate}." log.info "Found #{usersWithSchools.length} users of #{users.length} users registered after #{startDate}."
nextPrompt users nextPrompt users
nextPrompt = (users, question) -> nextPrompt = (users, question, userToSchool, suggestions) ->
# We look for the next top user to classify based on the number of suggestions we can make about what the school name should be. # We look for the next top user to classify based on the number of suggestions we can make about what the school name should be.
sortUsers users sortUsers users
unless userToSchool
return console.log('Done.') or process.exit() unless [userToSchool, suggestions] = findUserToSchool users return console.log('Done.') or process.exit() unless [userToSchool, suggestions] = findUserToSchool users
question ?= formatSuggestions userToSchool, suggestions question ?= formatSuggestions userToSchool, suggestions
prompt question, (answer) -> prompt question, (answer) ->
@@ -41,7 +45,10 @@ nextPrompt = (users, question) ->
return finalizePrompt userToSchool, suggestions, schoolName, users return finalizePrompt userToSchool, suggestions, schoolName, users
else if answer.length < 10 else if answer.length < 10
console.log "#{answer}? That's kind of short--I don't think school names and locations can be this short. What should it really be?" console.log "#{answer}? That's kind of short--I don't think school names and locations can be this short. What should it really be?"
return nextPrompt users, "> " return nextPrompt users, "> ", userToSchool, suggestions
else unless /,.+,/.test answer
console.log "#{answer}? We need the full location (with two commas), like Example High School, Springfield, IL. What should it really be?"
return nextPrompt users, "> ", userToSchool, suggestions
else else
return finalizePrompt userToSchool, suggestions, answer, users return finalizePrompt userToSchool, suggestions, answer, users
@@ -79,7 +86,7 @@ finalizePrompt = (userToSchool, suggestions, schoolName, users) ->
nextPrompt remainingUsers nextPrompt remainingUsers
findNumbers = (answer, max) -> findNumbers = (answer, max) ->
numbers = (parseInt(d, 10) for d in (' ' + answer + ' ').match(/ (\d+) /g) ? []) numbers = (parseInt(d, 10) for d in (' ' + answer + ' ').replace(/ /g, '  ').match(/ (\d+) /g) ? [])
ranges = answer.match(/(\d+-\d+)/g) or [] ranges = answer.match(/(\d+-\d+)/g) or []
for range in ranges for range in ranges
bounds = (parseInt(d, 10) for d in range.split('-')) bounds = (parseInt(d, 10) for d in range.split('-'))
@@ -90,12 +97,14 @@ findNumbers = (answer, max) ->
console.log "Incorrect number #{number} higher than max: #{max}" console.log "Incorrect number #{number} higher than max: #{max}"
numbers numbers
formatUser = (user) -> formatUser = (user, relativeToUser) ->
# TODO: replace date string with relative time since signup compared to target user, and actually make suggestions based on students that signed up at almost the same time # TODO: replace date string with relative time since signup compared to target user, and actually make suggestions based on students that signed up at almost the same time
_.values(_.pick(user, ['name', 'emailLower', 'ageRange', 'dateCreated', 'lastLevel', 'points', 'referrer', 'hourOfCode'])).join(' ') props = _.pick(user, ['name', 'emailLower', 'ageRange', 'dateCreated', 'lastLevel', 'points', 'referrer', 'hourOfCode'])
props.dateCreated = if relativeToUser then moment(props.dateCreated).from(relativeToUser.dateCreated) else moment(props.dateCreated).fromNow()
_.values(props).join(' ')
formatSuggestions = (userToSchool, suggestions) -> formatSuggestions = (userToSchool, suggestions) ->
suggestionPrompts = ("#{_.str.rpad(i + 1, 3)} #{_.str.rpad(s.schoolName, 50)} #{s.reasons.length} #{if s.reasons.length > 1 then 'Matches' else 'Match'}: #{s.reasons.join(', ')}\tfrom user: #{formatUser(s.user)}" for s, i in suggestions).join('\n') suggestionPrompts = ("#{_.str.rpad(i + 1, 3)} #{_.str.rpad(s.schoolName, 50)} #{s.reasons.length} #{if s.reasons.length > 1 then 'Matches' else 'Match'}: #{s.reasons.join(', ')}\tfrom user: #{formatUser(s.user, userToSchool)}" for s, i in suggestions).join('\n')
""" """
What should the school for this user be? What should the school for this user be?
0 #{_.str.rpad(userToSchool.schoolName, 50)} #{formatUser(userToSchool)} 0 #{_.str.rpad(userToSchool.schoolName, 50)} #{formatUser(userToSchool)}
@@ -104,20 +113,24 @@ formatSuggestions = (userToSchool, suggestions) ->
Choose a number, type a name, enter to skip, or q to quit. Choose a number, type a name, enter to skip, or q to quit.
> """ > """
checkedTopGroups = {}
findUserToSchool = (users) -> findUserToSchool = (users) ->
# We find the top user from the top group that we can make the most reasoned suggestions about what the school name would be. # We find the top user from the top group that we can make the most reasoned suggestions about what the school name would be.
[bestTarget, bestTargetSuggestions, mostReasons] = [null, [], 0] [bestTarget, bestTargetSuggestions, mostReasons, bestGroup] = [null, [], 0, null]
for field, groups of topGroups for field, groups of topGroups
for nextLargestGroup in groups for nextLargestGroup in groups when not checkedTopGroups[nextLargestGroup]
possibleTargets = userCategories[field][nextLargestGroup] possibleTargets = userCategories[field][nextLargestGroup]
schoolNames = _.uniq possibleTargets, 'schoolName' schoolNames = (t.schoolName for t in _.uniq possibleTargets, 'schoolName')
# TODO: better method to avoid showing users where everyone in the suggestion already has the same school (because we have already done this group) # TODO: better method to avoid showing users where everyone in the suggestion already has the same school (because we have already done this group)
for schoolName in schoolNames alreadyDone = false
if _.filter(possibleTargets, schoolName: schoolName).length > 0.5 * possibleTargets.length for schoolName in schoolNames when schoolName?.length > 10 and /,.+,/.test schoolName # Long enough school name with location info (two commas)
sharedCount = _.filter(possibleTargets, schoolName: schoolName).length
if sharedCount > 0.5 * possibleTargets.length
console.log 'Already done', schoolName, sharedCount, possibleTargets.length, 'for', field, nextLargestGroup
alreadyDone = true alreadyDone = true
continue if alreadyDone continue if alreadyDone
nSamples = Math.min 15, Math.max(4, Math.floor possibleTargets.length / 20) nSamples = Math.min 15, Math.max(4, Math.floor possibleTargets.length / 20)
console.log 'Checking', nSamples, 'samples of', possibleTargets.length, 'players in the biggest', field, 'group:', nextLargestGroup if debugging then console.log 'Checking', nSamples, 'samples of', possibleTargets.length, 'players in the biggest', field, 'group:', nextLargestGroup
for i in [0 ... nSamples] for i in [0 ... nSamples]
target = possibleTargets[Math.floor i * possibleTargets.length / (nSamples + 1)] target = possibleTargets[Math.floor i * possibleTargets.length / (nSamples + 1)]
suggestions = findSuggestions target suggestions = findSuggestions target
@@ -130,26 +143,28 @@ findUserToSchool = (users) ->
when 'Name' then 30 when 'Name' then 30
when 'Referrer' then 20 when 'Referrer' then 20
when 'Domain' then (if getDomain(target) is 'cps.edu' then 1 else 10) when 'Domain' then (if getDomain(target) is 'cps.edu' then 1 else 10)
when 'Clans' then 0.1 when 'Clans' then 0.01
sum sum
), 0 ), 0
if reasons > mostReasons if reasons > mostReasons
bestTarget = target bestTarget = target
bestTargetSuggestions = suggestions bestTargetSuggestions = suggestions
mostReasons = reasons mostReasons = reasons
bestGroup = nextLargestGroup
break break
checkedTopGroups[bestGroup] = true
return [bestTarget, bestTargetSuggestions] return [bestTarget, bestTargetSuggestions]
findSuggestions = (target) -> findSuggestions = (target) ->
# Look for other users with the same IP, course instances, clans, or similar school names or non-common shared email domains. # Look for other users with the same IP, course instances, clans, or similar school names or non-common shared email domains.
suggestions = [] suggestions = []
t0 = new Date() t0 = new Date()
console.log ' Checking suggestions for', target.emailLower, target.schoolName, (new Date()) - t0 if debugging then console.log ' Checking suggestions for', target.emailLower, target.schoolName, (new Date()) - t0
if target.lastIP if target.lastIP
for otherUser in (userCategories.lastIP[target.lastIP] ? []) when otherUser isnt target for otherUser in (userCategories.lastIP[target.lastIP] ? []) when otherUser isnt target
suggestions.push schoolName: otherUser.schoolName, reasons: ['IP'], user: otherUser suggestions.push schoolName: otherUser.schoolName, reasons: ['IP'], user: otherUser
for leagueType in ['courseInstances', 'clans'] for leagueType in ['courseInstances', 'clans']
console.log ' Now checking', leagueType, (new Date()) - t0 if debugging then console.log ' Now checking', leagueType, (new Date()) - t0
if target[leagueType]?.length if target[leagueType]?.length
for league in target[leagueType] for league in target[leagueType]
for otherUser in (userCategories[leagueType][league] ? []) when otherUser isnt target for otherUser in (userCategories[leagueType][league] ? []) when otherUser isnt target
@@ -159,7 +174,7 @@ findSuggestions = (target) ->
else else
suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser
if target.schoolName?.length > 5 if target.schoolName?.length > 5
console.log ' Now checking schoolName', (new Date()) - t0 if debugging then console.log ' Now checking schoolName', (new Date()) - t0
nameMatches = [] nameMatches = []
for otherSchoolName in topGroups.schoolName for otherSchoolName in topGroups.schoolName
score = stringScore otherSchoolName, target.schoolName, 0.8 score = stringScore otherSchoolName, target.schoolName, 0.8
@@ -173,7 +188,7 @@ findSuggestions = (target) ->
existingSuggestion.reasons.push reason existingSuggestion.reasons.push reason
else else
suggestions.push schoolName: match, reasons: [reason], user: otherUser suggestions.push schoolName: match, reasons: [reason], user: otherUser
console.log ' Now checking domain', (new Date()) - t0 if debugging then console.log ' Now checking domain', (new Date()) - t0
if domain = getDomain target if domain = getDomain target
for otherUser in (userCategories.domain[domain] ? []) when otherUser isnt target for otherUser in (userCategories.domain[domain] ? []) when otherUser isnt target
reason = "Domain" reason = "Domain"
@@ -181,7 +196,7 @@ findSuggestions = (target) ->
existingSuggestion.reasons.push reason existingSuggestion.reasons.push reason
else else
suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser
console.log ' Now checking referrer', (new Date()) - t0 if debugging then console.log ' Now checking referrer', (new Date()) - t0
if referrer = getReferrer target if referrer = getReferrer target
for otherUser in (userCategories.referrer[referrer] ? []) when otherUser isnt target for otherUser in (userCategories.referrer[referrer] ? []) when otherUser isnt target
reason = "Referrer" reason = "Referrer"
@@ -189,6 +204,7 @@ findSuggestions = (target) ->
existingSuggestion.reasons.push reason existingSuggestion.reasons.push reason
else else
suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser
if debugging then console.log ' Done checking referrer', (new Date()) - t0
suggestions = _.sortBy suggestions, 'schoolName' suggestions = _.sortBy suggestions, 'schoolName'
suggestions = _.sortBy suggestions, (s) -> -s.reasons.length suggestions = _.sortBy suggestions, (s) -> -s.reasons.length
return suggestions return suggestions
@@ -226,20 +242,21 @@ typoCache = {}
getDomain = (user) -> getDomain = (user) ->
return null unless domain = user.emailLower.split('@')[1] return null unless domain = user.emailLower.split('@')[1]
return null if commonEmailDomainMap[domain] return null if commonEmailDomainMap[domain]
typo = typoCache[domain] # Too slow? Is this actually slow?
return null if typo #typo = typoCache[domain]
return domain if typo is false #return null if typo
typo = _.find commonEmailDomains, (commonDomain) -> stringScore(commonDomain, domain, 0.8) > 0.9 #return domain if typo is false
typoCache[domain] = Boolean(typo) #typo = _.find commonEmailDomains, (commonDomain) -> stringScore(commonDomain, domain, 0.8) > 0.9
return null if typo #typoCache[domain] = Boolean(typo)
#return null if typo
domain domain
commonReferrersRegex = /(google|bing\.|yahoo|duckduckgo|jobs\.lever|code\.org|twitter|facebook|dollarclick|stumbleupon|vk\.com|playpcesor|reddit|lifehacker|favorite|bnext|freelance|taringa|blogthinkbig|graphism|inside\.com|korben|habrahabr|iplaysoft|geekbrains|playground|ycombinator|github\.com)/ commonReferrersRegex = /(google|bing\.|yahoo|duckduckgo|jobs\.lever|code\.org|twitter|facebook|dollarclick|stumbleupon|vk\.com|playpcesor|reddit|lifehacker|favorite|bnext|freelance|taringa|blogthinkbig|graphism|inside\.com|korben|habrahabr|iplaysoft|geekbrains|playground|ycombinator|github)/
getReferrer = (user) -> getReferrer = (user) ->
return null unless referrer = user.referrer?.toLowerCase().trim() return null unless referrer = user.referrer?.toLowerCase().trim()
referrer = referrer.replace /^https?:\/\//, '' referrer = referrer.replace /^https?:\/\//, ''
return null if commonReferrersRegex.test referrer return null if commonReferrersRegex.test referrer
return classCode if classCode = referrer.match(/\?_cc=(\S+)/)?[1] return classCode if classCode = referrer.match(/\?_cc=(\S+)$/)?[1]
return null if /codecombat/.test referrer return null if /codecombat/.test referrer
referrer referrer
@@ -316,7 +333,7 @@ commonEmailDomains = [
"virgin.net", "wanadoo.co.uk", "bt.com", "virgin.net", "wanadoo.co.uk", "bt.com",
# Domains used in Asia # Domains used in Asia
"sina.com", "qq.com", "naver.com", "hanmail.net", "daum.net", "nate.com", "yahoo.co.jp", "yahoo.co.kr", "yahoo.co.id", "yahoo.co.in", "yahoo.com.sg", "yahoo.com.ph", "sina.com", "qq.com", "naver.com", "hanmail.net", "daum.net", "nate.com", "yahoo.co.jp", "yahoo.co.kr", "yahoo.co.id", "yahoo.co.in", "yahoo.com.sg", "yahoo.com.ph", "yahoo.com.tw"
# French ISP domains # French ISP domains
"hotmail.fr", "live.fr", "laposte.net", "yahoo.fr", "wanadoo.fr", "orange.fr", "gmx.fr", "sfr.fr", "neuf.fr", "free.fr", "hotmail.fr", "live.fr", "laposte.net", "yahoo.fr", "wanadoo.fr", "orange.fr", "gmx.fr", "sfr.fr", "neuf.fr", "free.fr",
@@ -337,4 +354,4 @@ commonEmailDomains = [
"hotmail.com", "gmail.com", "yahoo.com.mx", "live.com.mx", "yahoo.com", "hotmail.es", "live.com", "hotmail.com.mx", "prodigy.net.mx", "msn.com" "hotmail.com", "gmail.com", "yahoo.com.mx", "live.com.mx", "yahoo.com", "hotmail.es", "live.com", "hotmail.com.mx", "prodigy.net.mx", "msn.com"
] ]
commonEmailDomainMap = {} commonEmailDomainMap = {}
commonEmailDomainMap[domain] = true for domain in commonEmailDomainMap commonEmailDomainMap[domain] = true for domain in commonEmailDomains