2015-12-01 21:04:50 -05:00
# Organize our users' schoolNames.
database = require ' ../server/commons/database '
mongoose = require ' mongoose '
log = require ' winston '
async = require ' async '
2015-12-09 14:55:03 -05:00
moment = require ' moment '
2015-12-01 21:04:50 -05:00
### SET UP ###
# Install lodash (with the underscore.string mixins) and tv4 as process-wide
# globals, open the database connection, then load the user modules.
# NOTE(review): presumably the server modules required below rely on these
# globals being present first -- confirm before reordering.
do (setupLodash = this) ->
  # `global` is the supported name; the `GLOBAL` alias was deprecated and later
  # removed from Node.js.
  global._ = require 'lodash'
  _.str = require 'underscore.string'
  _.mixin _.str.exports()
  global.tv4 = require('tv4').tv4
database.connect()
UserHandler = require '../server/users/user_handler'
User = require '../server/users/User'
2015-12-08 20:17:25 -05:00
# Only users created after this date are loaded. JavaScript months are
# 0-indexed, so this is December 1, 2015.
startDate = new Date 2015, 11, 1

# When true, findUserToSchool and findSuggestions log extra progress/timing output.
debugging = false

query = dateCreated: {$gt: startDate}, emailLower: {$exists: true}
selection = ' name emailLower schoolName courseInstances clans ageRange dateCreated referrer points lastIP hourOfCode preferredLanguage lastLevel '

# Load the candidate users once, then start the interactive prompt loop.
User.find(query).select(selection).lean().exec (err, users) ->
  # Without this guard a failed query leaves `users` undefined and the length
  # lookup below throws a confusing TypeError.
  if err
    log.error 'Could not load users:', err
    return process.exit 1
  usersWithSchools = _.filter users, 'schoolName'
  log.info " Found #{usersWithSchools.length} users of #{users.length} users registered after #{startDate} . "
  nextPrompt users
2015-12-09 14:55:03 -05:00
# Ask the operator what school the next user belongs to and dispatch on the answer.
#
# users        - remaining pool of user documents.
# question     - optional prompt text; defaults to the formatted suggestion list.
# userToSchool - optional user to ask about again (passed when re-prompting).
# suggestions  - suggestions that belong to userToSchool (passed when re-prompting).
nextPrompt = (users, question, userToSchool, suggestions) ->
  # We look for the next top user to classify based on the number of suggestions we can make about what the school name should be.
  sortUsers users
  unless userToSchool
    # findUserToSchool always returns an array, so the destructured target has
    # to be tested; the assignment expression itself is always truthy.
    [userToSchool, suggestions] = findUserToSchool users
    unless userToSchool
      console.log ' Done. '
      return process.exit()
  question ?= formatSuggestions userToSchool, suggestions
  prompt question, (answer) ->
    answer = answer.trim()
    if answer in ['q', 'quit']
      console.log ' Bye. '
      return process.exit()
    # Blank answer: skip this user.
    if answer is ''
      return nextPrompt _.without users, userToSchool
    # Only a purely numeric answer is a menu choice; a name that merely starts
    # with a digit ("1st Street School, ...") falls through to the name checks.
    if /^\d+$/.test answer
      num = parseInt answer, 10
      schoolName = if num then suggestions[num - 1]?.schoolName else userToSchool.schoolName
      unless schoolName
        # Out-of-range number, or an entry that has no school name to reuse.
        console.log "#{answer}? There is no school name for that number. What should it really be?"
        return nextPrompt users, " > ", userToSchool, suggestions
      return finalizePrompt userToSchool, suggestions, schoolName, users
    if answer.length < 10
      console.log " #{answer} ? That ' s kind of short--I don ' t think school names and locations can be this short. What should it really be? "
      return nextPrompt users, " > ", userToSchool, suggestions
    unless /,.+,/.test answer
      console.log " #{answer} ? We need the full location (with two commas), like Example High School, Springfield, IL. What should it really be? "
      return nextPrompt users, " > ", userToSchool, suggestions
    finalizePrompt userToSchool, suggestions, answer, users
2015-12-01 21:04:50 -05:00
2015-12-08 15:12:56 -05:00
# Ask which users should receive the chosen school name, write it, then move on.
#
# userToSchool - the user being classified (number 0 in the prompt).
# suggestions  - suggestion objects for that user; number n means suggestions[n - 1].
# schoolName   - the school name selected in nextPrompt.
# users        - remaining pool of users; updated users are removed from it
#                before the next prompt.
finalizePrompt = (userToSchool, suggestions, schoolName, users) ->
  console.log " Selected schoolName: \" #{schoolName} \" "
  # A blank answer is handled like '0' below (only this user), so the prompt
  # says that instead of promising a retype.
  question = " Also apply this to other users? Ex.: ' all ' , ' 0 1 2 5 9-14 ' , ' all but 38 59-65 ' , ' 0 ' or blank to just do this one, or q to quit. \n > "
  prompt question, (answer) ->
    answer = answer.trim()
    if answer in ['q', 'quit']
      console.log ' Bye. '
      return process.exit()
    everyone = [userToSchool].concat (s.user for s in suggestions)
    # Number 0 is the target user. Unknown numbers resolve to undefined rather
    # than throwing on a missing suggestion.
    userForNumber = (number) ->
      if number then suggestions[number - 1]?.user else userToSchool
    if answer is ''
      console.log " Should just do ", userToSchool._id, userToSchool.emailLower, userToSchool.schoolName
      targets = [userToSchool]
    else if answer is 'all'
      targets = everyone
      console.log " Doing all #{targets.length} users... "
    else if /^all/.test answer
      # "all but ...": everyone except the listed numbers.
      numbers = findNumbers answer, suggestions.length
      skipped = _.compact (userForNumber(number) for number in numbers)
      targets = _.difference everyone, skipped
      console.log " Doing all #{targets.length} users without #{numbers} ... "
    else
      numbers = findNumbers answer, suggestions.length
      targets = _.uniq _.compact (userForNumber(number) for number in numbers)
      console.log " Doing #{targets.length} users for #{numbers} ... "
    unless targets.length
      # Nothing valid was selected; ask again instead of running a no-op update
      # and silently moving past this user.
      console.log 'No valid users selected; please choose again.'
      return finalizePrompt userToSchool, suggestions, schoolName, users
    User.update {_id: {$in: _.map(targets, '_id')}}, {schoolName: schoolName}, {multi: true}, (err, result) ->
      if err
        console.error " Ran into error doing the save: ", err
        return finalizePrompt userToSchool, suggestions, schoolName, users
      console.log " Updated users ' schoolNames. Result: ", result
      # Take these users out of the pool to make suggestions about before going on to next suggestions.
      remainingUsers = _.without users, targets...
      nextPrompt remainingUsers
2015-12-08 17:42:33 -05:00
# Extract the user numbers named in an answer such as '0 1 2 5 9-14' or
# 'all but 38 59-65'.
#
# answer - raw answer text.
# max    - highest valid number (the number of suggestions).
#
# Returns the distinct numbers in first-seen order: stand-alone integers first,
# then the expanded inclusive ranges. Numbers above max are reported on the
# console and left out, so callers can index with the result safely.
findNumbers = (answer, max) ->
  numbers = []
  # Stand-alone integers: space-delimited tokens made only of digits.
  for token in answer.split(' ') when /^\d+$/.test token
    numbers.push parseInt(token, 10)
  # Inclusive ranges such as "9-14".
  for range in answer.match(/\d+-\d+/g) or []
    [first, last] = (parseInt(d, 10) for d in range.split('-'))
    numbers.push number for number in [first..last]
  valid = []
  for number in numbers
    if number > max
      console.log " Incorrect number #{number} higher than max: #{max} "
    else if number not in valid
      valid.push number
  valid
2015-12-09 14:55:03 -05:00
# Build a one-line summary of a user for display in the prompts.
#   user           - user document to summarize.
#   relativeToUser - optional; when given, the signup time is shown relative to
#                    this user's dateCreated instead of relative to now.
# Returns the values of the picked properties joined into a single string.
formatUser = (user, relativeToUser) ->
2015-12-08 20:17:25 -05:00
# TODO: replace date string with relative time since signup compared to target user, and actually make suggestions based on students that signed up at almost the same time
2015-12-09 14:55:03 -05:00
# Only these properties are shown; anything else on the document is ignored.
props = _ . pick ( user , [ ' name ' , ' emailLower ' , ' ageRange ' , ' dateCreated ' , ' lastLevel ' , ' points ' , ' referrer ' , ' hourOfCode ' ] )
# Replace the raw date with moment's relative-time text.
props.dateCreated = if relativeToUser then moment ( props . dateCreated ) . from ( relativeToUser . dateCreated ) else moment ( props . dateCreated ) . fromNow ( )
_ . values ( props ) . join ( ' ' )
2015-12-08 15:12:56 -05:00
# Build the question text shown by nextPrompt for one user.
#   userToSchool - the user being classified; listed as entry 0.
#   suggestions  - suggestion objects ({schoolName, reasons, user}); listed as entries 1..n.
# Returns the multi-line prompt string.
formatSuggestions = (userToSchool, suggestions) ->
2015-12-09 14:55:03 -05:00
# One line per suggestion: its 1-based number, the suggested school name padded
# to 50 columns, the number of matching reasons and their names, and a summary
# of the suggesting user relative to userToSchool.
suggestionPrompts = ( " #{ _ . str . rpad ( i + 1 , 3 ) } #{ _ . str . rpad ( s . schoolName , 50 ) } #{ s . reasons . length } #{ if s . reasons . length > 1 then ' Matches ' else ' Match ' } : #{ s . reasons . join ( ' , ' ) } \t from user: #{ formatUser ( s . user , userToSchool ) } " for s , i in suggestions ) . join ( ' \n ' )
2015-12-08 15:12:56 -05:00
"""
What should the school for this user be ?
0 #{_.str.rpad(userToSchool.schoolName, 50)} #{formatUser(userToSchool)}
Suggestions:
#{suggestionPrompts}
Choose a number , type a name , enter to skip , or q to quit .
> """
2015-12-09 14:55:03 -05:00
# Group keys that were already chosen as the best group by findUserToSchool;
# such groups are skipped on later calls.
checkedTopGroups = { }
2015-12-01 21:04:50 -05:00
# Pick the next user to ask about, together with the suggestions found for them.
#   users - not referenced in the body; the function reads the module-level
#           topGroups and userCategories tables instead.
# Returns [bestTarget, bestTargetSuggestions]; this is [null, []] when no sampled
# user scored above 0.
findUserToSchool = (users) ->
2015-12-08 17:42:33 -05:00
# We find the top user from the top group that we can make the most reasoned suggestions about what the school name would be.
2015-12-09 14:55:03 -05:00
[ bestTarget , bestTargetSuggestions , mostReasons , bestGroup ] = [ null , [ ] , 0 , null ]
2015-12-08 15:12:56 -05:00
for field , groups of topGroups
2015-12-09 14:55:03 -05:00
for nextLargestGroup in groups when not checkedTopGroups [ nextLargestGroup ]
2015-12-08 20:17:25 -05:00
possibleTargets = userCategories [ field ] [ nextLargestGroup ]
2015-12-09 14:55:03 -05:00
schoolNames = ( t . schoolName for t in _ . uniq possibleTargets , ' schoolName ' )
2015-12-08 20:17:25 -05:00
# TODO: better method to avoid showing users where everyone in the suggestion already has the same school (because we have already done this group)
2015-12-09 14:55:03 -05:00
# A group counts as already done when more than half of its members share one
# fully specified school name.
alreadyDone = false
for schoolName in schoolNames when schoolName ? . length > 10 and /,.+,/ . test schoolName # Long enough school name with location info (two commas)
sharedCount = _ . filter ( possibleTargets , schoolName: schoolName ) . length
if sharedCount > 0.5 * possibleTargets . length
console . log ' Already done ' , schoolName , sharedCount , possibleTargets . length , ' for ' , field , nextLargestGroup
2015-12-08 20:17:25 -05:00
alreadyDone = true
continue if alreadyDone
# Sample between 4 and 15 members (one per 20 members in between), spread
# evenly across the group.
nSamples = Math . min 15 , Math . max ( 4 , Math . floor possibleTargets . length / 20 )
2015-12-09 14:55:03 -05:00
if debugging then console . log ' Checking ' , nSamples , ' samples of ' , possibleTargets . length , ' players in the biggest ' , field , ' group: ' , nextLargestGroup
2015-12-08 20:17:25 -05:00
for i in [ 0 . . . nSamples ]
target = possibleTargets [ Math . floor i * possibleTargets . length / ( nSamples + 1 ) ]
suggestions = findSuggestions target
# Score the sample by summing a weight for every reason on its suggestions.
# NOTE(review): the reducer loops over all of `suggestions` again on every
# reduce step, so each reason appears to be counted once per suggestion --
# confirm whether that amplification is intended.
# NOTE(review): the switch has no else branch, so an unlisted reason would add
# undefined to the sum.
reasons = _ . reduce suggestions , ( (sum, suggestion) ->
for suggestion in suggestions
for reason in suggestion . reasons
sum += switch reason
when ' Course instances ' then 50
when ' IP ' then 40
when ' Name ' then 30
when ' Referrer ' then 20
when ' Domain ' then ( if getDomain ( target ) is ' cps.edu ' then 1 else 10 )
2015-12-09 14:55:03 -05:00
when ' Clans ' then 0.01
2015-12-08 20:17:25 -05:00
sum
) , 0
if reasons > mostReasons
bestTarget = target
bestTargetSuggestions = suggestions
mostReasons = reasons
2015-12-09 14:55:03 -05:00
bestGroup = nextLargestGroup
2015-12-08 20:17:25 -05:00
# NOTE(review): indentation is not visible in this view; confirm which loop
# this break leaves.
break
2015-12-09 14:55:03 -05:00
# Remember the winning group so it is not offered again (the key is the string
# 'null' when nothing was selected).
checkedTopGroups [ bestGroup ] = true
2015-12-08 15:12:56 -05:00
return [ bestTarget , bestTargetSuggestions ]
# Collect other users who are likely to attend the same school as `target`.
#   target - the user document to find suggestions for.
# Returns an array of {schoolName, reasons, user} objects, one per other user,
# sorted by school name and then by descending number of reasons.
findSuggestions = (target) ->
2015-12-08 17:42:33 -05:00
# Look for other users with the same IP, course instances, clans, or similar school names or non-common shared email domains.
2015-12-08 15:12:56 -05:00
suggestions = [ ]
2015-12-08 20:17:25 -05:00
# t0 is only used for the elapsed-time figures in the debugging logs.
t0 = new Date ( )
2015-12-09 14:55:03 -05:00
if debugging then console . log ' Checking suggestions for ' , target . emailLower , target . schoolName , ( new Date ( ) ) - t0
2015-12-08 15:12:56 -05:00
# Same last IP. This pass runs first and pushes without checking for an
# existing entry for the same user.
if target . lastIP
2015-12-08 20:17:25 -05:00
for otherUser in ( userCategories . lastIP [ target . lastIP ] ? [ ] ) when otherUser isnt target
suggestions . push schoolName: otherUser . schoolName , reasons: [ ' IP ' ] , user: otherUser
2015-12-08 15:12:56 -05:00
# Shared course instances or clans. From here on, a user who is already
# suggested gets the extra reason appended instead of a second entry.
for leagueType in [ ' courseInstances ' , ' clans ' ]
2015-12-09 14:55:03 -05:00
if debugging then console . log ' Now checking ' , leagueType , ( new Date ( ) ) - t0
2015-12-08 15:12:56 -05:00
if target [ leagueType ] ? . length
for league in target [ leagueType ]
2015-12-08 20:17:25 -05:00
for otherUser in ( userCategories [ leagueType ] [ league ] ? [ ] ) when otherUser isnt target
reason = _ . str . humanize ( leagueType )
2015-12-08 15:12:56 -05:00
if existingSuggestion = _ . find ( suggestions , user: otherUser )
existingSuggestion . reasons . push reason
else
suggestions . push schoolName: otherUser . schoolName , reasons: [ reason ] , user: otherUser
# Similar school names: score every top school-name group against the target's
# name, drop scores below 0.25, and use the ten best matches.
if target . schoolName ? . length > 5
2015-12-09 14:55:03 -05:00
if debugging then console . log ' Now checking schoolName ' , ( new Date ( ) ) - t0
2015-12-08 15:12:56 -05:00
nameMatches = [ ]
for otherSchoolName in topGroups . schoolName
score = stringScore otherSchoolName , target . schoolName , 0.8
continue if score < 0.25
nameMatches . push schoolName: otherSchoolName , score: score
nameMatches = ( match . schoolName for match in ( _ . sortBy nameMatches , (match) -> - match . score ) )
for match in nameMatches . slice ( 0 , 10 )
2015-12-08 20:17:25 -05:00
reason = " Name "
for otherUser in ( userCategories . schoolName [ match ] ? [ ] ) when otherUser isnt target
2015-12-08 15:12:56 -05:00
if existingSuggestion = _ . find ( suggestions , user: otherUser )
existingSuggestion . reasons . push reason
else
suggestions . push schoolName: match , reasons: [ reason ] , user: otherUser
2015-12-09 14:55:03 -05:00
if debugging then console . log ' Now checking domain ' , ( new Date ( ) ) - t0
2015-12-08 17:42:33 -05:00
# Same email domain; getDomain returns null for the common providers.
if domain = getDomain target
2015-12-08 20:17:25 -05:00
for otherUser in ( userCategories . domain [ domain ] ? [ ] ) when otherUser isnt target
reason = " Domain "
2015-12-08 17:42:33 -05:00
if existingSuggestion = _ . find ( suggestions , user: otherUser )
existingSuggestion . reasons . push reason
else
suggestions . push schoolName: otherUser . schoolName , reasons: [ reason ] , user: otherUser
2015-12-09 14:55:03 -05:00
if debugging then console . log ' Now checking referrer ' , ( new Date ( ) ) - t0
2015-12-08 20:17:25 -05:00
# Same referrer (or class code), as normalized by getReferrer.
if referrer = getReferrer target
for otherUser in ( userCategories . referrer [ referrer ] ? [ ] ) when otherUser isnt target
reason = " Referrer "
if existingSuggestion = _ . find ( suggestions , user: otherUser )
existingSuggestion . reasons . push reason
else
suggestions . push schoolName: otherUser . schoolName , reasons: [ reason ] , user: otherUser
2015-12-09 14:55:03 -05:00
if debugging then console . log ' Done checking referrer ' , ( new Date ( ) ) - t0
2015-12-08 20:17:25 -05:00
# Two passes: by school name, then by descending reason count.
# NOTE(review): this relies on _.sortBy being stable to keep the school-name
# order within equal reason counts -- confirm for the lodash version in use.
suggestions = _ . sortBy suggestions , ' schoolName '
suggestions = _ . sortBy suggestions , (s) -> - s . reasons . length
return suggestions
2015-12-08 15:12:56 -05:00
# Module-level lookup tables rebuilt by sortUsers:
#   userCategories[field][value] - array of users that have that value for the field.
#   topGroups[field]             - values of that field, largest group first,
#                                  limited to groups within the size bounds below.
userCategories = { }
topGroups = { }
# NOTE(review): usersCategorized is not referenced anywhere in the visible source.
usersCategorized = { }
# Rebuild userCategories and topGroups from the given users.
#   users - array of user documents. Only the local variable is re-sorted; the
#           caller's array is left as it was.
sortUsers = (users) ->
users = _ . sortBy users , (u) -> - u . points
users = _ . sortBy users , [ ' schoolName ' , ' lastIP ' ]
2015-12-08 20:17:25 -05:00
for field in [ ' courseInstances ' , ' lastIP ' , ' schoolName ' , ' domain ' , ' clans ' , ' referrer ' ]
2015-12-08 15:12:56 -05:00
userCategories [ field ] = categorizeUsers users , field
topGroups [ field ] = _ . sortBy _ . keys ( userCategories [ field ] ) , (key) -> - userCategories [ field ] [ key ] . length
# Keep groups with more than 2 members and fewer than 5000 (30 for clans).
topGroups [ field ] = ( group for group in topGroups [ field ] when 2 < userCategories [ field ] [ group ] . length < ( if field is ' clans ' then 30 else 5000 ) )
# Group users by the value(s) they have for one field.
#   users - array of user documents.
#   field - property name to group by; the domain and referrer fields are
#           derived through getDomain / getReferrer instead of read directly.
# Returns an object mapping each value to the array of users that have it. A
# user whose field holds an array is listed under every element.
categorizeUsers = (users, field) ->
categories = { }
2015-12-08 17:42:33 -05:00
for user in users
if field is ' domain '
value = getDomain user
2015-12-08 20:17:25 -05:00
else if field is ' referrer '
value = getReferrer user
2015-12-08 17:42:33 -05:00
else
value = user [ field ]
continue unless value
2015-12-08 15:12:56 -05:00
values = if _ . isArray ( value ) then value else [ value ]
for value in values when value
2015-12-08 17:42:33 -05:00
# Values that have a trim method are trimmed; ones that end up empty are skipped.
continue if value . trim and not value = value . trim ( )
2015-12-08 15:12:56 -05:00
categories [ value ] ? = [ ]
categories [ value ] . push user
categories
2015-12-01 21:04:50 -05:00
2015-12-08 20:17:25 -05:00
# Cache for the typo check that is currently commented out in getDomain.
typoCache = {}

# Return the email domain to group a user by.
#
# user - user document; emailLower is read.
#
# Returns the part of emailLower after the first '@', or null when there is no
# email or domain, or when the domain is listed in commonEmailDomainMap (so it
# says nothing about the school).
getDomain = (user) ->
  # Guard against a missing or null emailLower instead of throwing on .split.
  return null unless domain = user.emailLower?.split('@')[1]
  return null if commonEmailDomainMap[domain]
  # Too slow? Is this actually slow?
  #typo = typoCache[domain]
  #return null if typo
  #return domain if typo is false
  #typo = _.find commonEmailDomains, (commonDomain) -> stringScore(commonDomain, domain, 0.8) > 0.9
  #typoCache[domain] = Boolean(typo)
  #return null if typo
  domain
2015-12-01 21:04:50 -05:00
2015-12-09 14:55:03 -05:00
# Referrers matching this pattern are ignored by getReferrer.
commonReferrersRegex = /(google|bing\.|yahoo|duckduckgo|jobs\.lever|code\.org|twitter|facebook|dollarclick|stumbleupon|vk\.com|playpcesor|reddit|lifehacker|favorite|bnext|freelance|taringa|blogthinkbig|graphism|inside\.com|korben|habrahabr|iplaysoft|geekbrains|playground|ycombinator|github)/
2015-12-08 20:17:25 -05:00
# Return the referrer value to group a user by.
#   user - user document; `referrer` is read.
# Returns, in order of precedence: null for a missing/empty referrer or one that
# matches commonReferrersRegex; the value captured after a trailing "?_cc=";
# null for any other referrer containing "codecombat"; otherwise the
# lower-cased, trimmed referrer after the replace below.
getReferrer = (user) ->
return null unless referrer = user . referrer ? . toLowerCase ( ) . trim ( )
# NOTE(review): this looks like it strips a leading http:// or https://, but the
# pattern is space-mangled in this view -- confirm against the real file.
referrer = referrer . replace / ^ https ? : \ / \ / / , ' '
return null if commonReferrersRegex . test referrer
2015-12-09 14:55:03 -05:00
return classCode if classCode = referrer . match ( /\?_cc=(\S+)$/ ) ? [ 1 ]
2015-12-08 20:17:25 -05:00
return null if /codecombat/ . test referrer
referrer
2015-12-01 21:04:50 -05:00
# https://github.com/joshaven/string_score
# Fuzzy-match score of `word` against `_a`, ported from joshaven/string_score.
#
# _a        - the string being searched.
# word      - the query/abbreviation to look for inside _a.
# fuzziness - optional number. When truthy, characters of `word` that cannot be
#             found are tolerated and each one increases the divisor by
#             (1 - fuzziness). When falsy, any unmatched character yields 0.
#
# Returns 1 for identical strings, 0 when either string is empty (and they are
# not identical), otherwise the computed score.
stringScore = (_a, word, fuzziness) ->
  return 1 if word is _a
  # An empty word matches nothing, and an empty string would divide by zero below.
  return 0 if word is '' or _a is ''
  string = _a
  lString = string.toLowerCase()
  lWord = word.toLowerCase()
  fuzzyFactor = 1 - fuzziness if fuzziness
  runningScore = 0
  startAt = 0
  fuzzies = 1
  # Walk through `word`, locating each character in `string` at or after the
  # previous match. Both modes share this loop; they differ only in how a
  # missing character is handled.
  for i in [0...word.length]
    idxOf = lString.indexOf lWord[i], startAt
    if idxOf is -1
      return 0 unless fuzziness
      fuzzies += fuzzyFactor
      continue
    if startAt is idxOf
      # Consecutive-letter match.
      charScore = 0.7
    else
      charScore = 0.1
      # Acronym bonus: the matched character starts a new word.
      charScore += 0.8 if string[idxOf - 1] is ' '
    # Same-case bonus.
    charScore += 0.1 if string[idxOf] is word[i]
    runningScore += charScore
    startAt = idxOf + 1
  # Average the score relative to both lengths, then reduce it by the fuzzy penalty.
  finalScore = 0.5 * (runningScore / string.length + runningScore / word.length) / fuzzies
  # Bonus when the first letters match, unless the score is already high.
  finalScore += 0.15 if lWord[0] is lString[0] and finalScore < 0.85
  finalScore
# Write a question to stdout and hand the next chunk read from stdin to the callback.
#   question - text written to stdout (no newline is added).
#   callback - called once with the received data converted to a trimmed string.
prompt = (question, callback) ->
process . stdin . resume ( )
process . stdout . write question
# `once` means only the next data event is handled for this question.
process . stdin . once ' data ' , (data) ->
callback data . toString ( ) . trim ( )
2015-12-08 17:42:33 -05:00
# https://github.com/mailcheck/mailcheck/wiki/List-of-Popular-Domains
# Email domains of widely used mail providers. getDomain returns null for these,
# so they are never used to group users. Some entries are repeated across
# sections; that is harmless because the list only feeds the lookup map below.
commonEmailDomains = [
# Default domains included
" aol.com " , " att.net " , " comcast.net " , " facebook.com " , " gmail.com " , " gmx.com " , " googlemail.com " ,
" google.com " , " hotmail.com " , " hotmail.co.uk " , " mac.com " , " me.com " , " mail.com " , " msn.com " ,
" live.com " , " sbcglobal.net " , " verizon.net " , " yahoo.com " , " yahoo.co.uk " ,
# Other global domains
" email.com " , " games.com " , " gmx.net " , " hush.com " , " hushmail.com " , " icloud.com " , " inbox.com " ,
" lavabit.com " , " love.com " , " outlook.com " , " pobox.com " , " rocketmail.com " ,
" safe-mail.net " , " wow.com " , " ygm.com " , " ymail.com " , " zoho.com " , " fastmail.fm " ,
# United States ISP domains
" bellsouth.net " , " charter.net " , " comcast.net " , " cox.net " , " earthlink.net " , " juno.com " ,
# British ISP domains
" btinternet.com " , " virginmedia.com " , " blueyonder.co.uk " , " freeserve.co.uk " , " live.co.uk " ,
" ntlworld.com " , " o2.co.uk " , " orange.net " , " sky.com " , " talktalk.co.uk " , " tiscali.co.uk " ,
" virgin.net " , " wanadoo.co.uk " , " bt.com " ,
# Domains used in Asia
2015-12-09 14:55:03 -05:00
" sina.com " , " qq.com " , " naver.com " , " hanmail.net " , " daum.net " , " nate.com " , " yahoo.co.jp " , " yahoo.co.kr " , " yahoo.co.id " , " yahoo.co.in " , " yahoo.com.sg " , " yahoo.com.ph " , " yahoo.com.tw "
2015-12-08 17:42:33 -05:00
# French ISP domains
" hotmail.fr " , " live.fr " , " laposte.net " , " yahoo.fr " , " wanadoo.fr " , " orange.fr " , " gmx.fr " , " sfr.fr " , " neuf.fr " , " free.fr " ,
# German ISP domains
" gmx.de " , " hotmail.de " , " live.de " , " online.de " , " t-online.de " , " web.de " , " yahoo.de " ,
# Russian ISP domains
" mail.ru " , " rambler.ru " , " yandex.ru " , " ya.ru " , " list.ru " ,
# Belgian ISP domains
" hotmail.be " , " live.be " , " skynet.be " , " voo.be " , " tvcablenet.be " , " telenet.be " ,
# Argentinian ISP domains
" hotmail.com.ar " , " live.com.ar " , " yahoo.com.ar " , " fibertel.com.ar " , " speedy.com.ar " , " arnet.com.ar " ,
# Domains used in Mexico
" hotmail.com " , " gmail.com " , " yahoo.com.mx " , " live.com.mx " , " yahoo.com " , " hotmail.es " , " live.com " , " hotmail.com.mx " , " prodigy.net.mx " , " msn.com "
]
# Lookup table keyed by domain, so getDomain can test membership without
# scanning the list.
commonEmailDomainMap = { }
2015-12-09 14:55:03 -05:00
commonEmailDomainMap [ domain ] = true for domain in commonEmailDomains