2015-12-01 21:04:50 -05:00
# Organize our users' schoolNames.
database = require ' ../server/commons/database '
mongoose = require ' mongoose '
log = require ' winston '
async = require ' async '
### SET UP ###
do (setupLodash = this) ->
GLOBAL._ = require ' lodash '
_.str = require ' underscore.string '
_ . mixin _ . str . exports ( )
GLOBAL.tv4 = require ( ' tv4 ' ) . tv4
database . connect ( )
UserHandler = require ' ../server/users/user_handler '
User = require ' ../server/users/User '
2015-12-08 15:12:56 -05:00
#startDate = new Date 2015, 11, 1
startDate = new Date 2015 , 11 , 8 # Testing
2015-12-01 21:04:50 -05:00
query = dateCreated: { $gt: startDate } , emailLower: { $exists: true }
2015-12-08 15:12:56 -05:00
selection = ' name emailLower schoolName courseInstances clans ageRange dateCreated referrer points lastIP hourOfCode preferredLanguage lastLevel '
2015-12-01 21:04:50 -05:00
User . find ( query ) . select ( selection ) . lean ( ) . exec (err, users) ->
usersWithSchools = _ . filter users , ' schoolName '
2015-12-08 17:42:33 -05:00
log . info " Found #{ usersWithSchools . length } users of #{ users . length } users registered after #{ startDate } . "
2015-12-01 21:04:50 -05:00
nextPrompt users
2015-12-08 15:12:56 -05:00
nextPrompt = (users, question) ->
2015-12-08 17:42:33 -05:00
# We look for the next top user to classify based on the number of suggestions we can make about what the school name should be.
2015-12-08 15:12:56 -05:00
sortUsers users
2015-12-01 21:04:50 -05:00
return console . log ( ' Done. ' ) or process . exit ( ) unless [ userToSchool , suggestions ] = findUserToSchool users
2015-12-08 15:12:56 -05:00
question ? = formatSuggestions userToSchool , suggestions
prompt question , (answer) ->
2015-12-01 21:04:50 -05:00
return console . log ( ' Bye. ' ) or process . exit ( ) if answer in [ ' q ' , ' quit ' ]
2015-12-08 15:12:56 -05:00
answer = answer . trim ( )
if answer is ' '
users = _ . without users , userToSchool
else unless _ . isNaN ( num = parseInt ( answer , 10 ) )
schoolName = if num then suggestions [ num - 1 ] ? . schoolName else userToSchool . schoolName
return finalizePrompt userToSchool , suggestions , schoolName , users
else if answer . length < 10
console . log " #{ answer } ? That ' s kind of short--I don ' t think school names and locations can be this short. What should it really be? "
return nextPrompt users , " > "
else
return finalizePrompt userToSchool , suggestions , answer , users
2015-12-01 21:04:50 -05:00
nextPrompt users
2015-12-08 15:12:56 -05:00
finalizePrompt = (userToSchool, suggestions, schoolName, users) ->
console . log " Selected schoolName: \" #{ schoolName } \" "
2015-12-08 17:42:33 -05:00
question = " Also apply this to other users? Ex.: ' all ' , ' 0 1 2 5 9-14 ' , ' all but 38 59-65 ' , ' 0 ' to just do this one, or blank to retype school name. \n > "
2015-12-08 15:12:56 -05:00
prompt question , (answer) ->
answer = answer . trim ( )
if answer is ' '
console . log " Should just do " , userToSchool . _id , userToSchool . emailLower , userToSchool . schoolName
targets = [ userToSchool ]
else if answer is ' all '
targets = [ userToSchool ] . concat ( s . user for s in suggestions )
console . log " Doing all #{ targets . length } users... "
else if /^all/ . test answer
2015-12-08 17:42:33 -05:00
numbers = findNumbers answer , suggestions . length
2015-12-08 15:12:56 -05:00
targets = [ userToSchool ] . concat ( s . user for s in suggestions )
for number in numbers
skip = if number then suggestions [ number - 1 ] . user else userToSchool
targets = _ . without targets , skip
console . log " Doing all #{ targets . length } users without #{ numbers } ... "
else
2015-12-08 17:42:33 -05:00
numbers = findNumbers answer , suggestions . length
targets = _ . filter ( ( if number then suggestions [ number - 1 ] . user else userToSchool ) for number in numbers )
2015-12-08 15:12:56 -05:00
console . log " Doing #{ targets . length } users for #{ numbers } ... "
#User.update {_id: {$in: (_.map targets, '_id')}}, {schoolName: schoolName}, {multi: true}, (err, result) ->
User . update { _id: { $in: [ ] } } , { schoolName: schoolName } , { multi: true } , (err, result) ->
if err
console . error " Ran into error doing the save: " , err
return finalizePrompt userToSchool , suggestions , schoolName , users
console . log " Updated users ' schoolNames. Result: " , result
2015-12-08 17:42:33 -05:00
# Take these users out of the pool to make suggestions about before going on to next suggestions.
2015-12-08 15:12:56 -05:00
remainingUsers = _ . without users , targets . . .
nextPrompt remainingUsers
2015-12-08 17:42:33 -05:00
findNumbers = (answer, max) ->
numbers = ( parseInt ( d , 10 ) for d in ( ' ' + answer + ' ' ) . match ( / ( \ d + ) / g ) ? [ ] )
ranges = answer . match ( /(\d+-\d+)/g ) or [ ]
for range in ranges
bounds = ( parseInt ( d , 10 ) for d in range . split ( ' - ' ) )
for number in [ bounds [ 0 ] . . bounds [ 1 ] ]
numbers . push number
for number in numbers
if number > max
console . log " Incorrect number #{ number } higher than max: #{ max } "
numbers
2015-12-08 15:12:56 -05:00
formatUser = (user) ->
# TODO: replace date string with relative time since signup compared to target user
_ . values ( _ . pick ( user , [ ' name ' , ' emailLower ' , ' ageRange ' , ' dateCreated ' , ' lastLevel ' , ' points ' , ' referrer ' , ' hourOfCode ' ] ) ) . join ( ' ' )
formatSuggestions = (userToSchool, suggestions) ->
suggestionPrompts = ( " #{ _ . str . rpad ( i + 1 , 3 ) } #{ _ . str . rpad ( s . schoolName , 50 ) } #{ s . reasons . join ( ' + ' ) } \t from user: #{ formatUser ( s . user ) } " for s , i in suggestions ) . join ( ' \n ' )
"""
What should the school for this user be ?
0 #{_.str.rpad(userToSchool.schoolName, 50)} #{formatUser(userToSchool)}
Suggestions:
#{suggestionPrompts}
Choose a number , type a name , enter to skip , or q to quit .
> """
2015-12-01 21:04:50 -05:00
findUserToSchool = (users) ->
2015-12-08 17:42:33 -05:00
# We find the top user from the top group that we can make the most reasoned suggestions about what the school name would be.
2015-12-08 15:12:56 -05:00
# TODO: don't show users where everyone in the suggestion already has the same school (because we have already done this group)
[ bestTarget , bestTargetSuggestions , mostReasons ] = [ null , [ ] , 0 ]
for field , groups of topGroups
largestGroup = groups [ 0 ]
target = userCategories [ field ] [ largestGroup ] [ 0 ]
suggestions = findSuggestions target
reasons = _ . reduce suggestions , ( (sum, suggestion) -> sum + ( if suggestion . schoolName then suggestion . reasons . length else 0 ) ) , 0
if reasons > mostReasons
bestTarget = target
bestTargetSuggestions = suggestions
mostReasons = reasons
return [ bestTarget , bestTargetSuggestions ]
findSuggestions = (target) ->
2015-12-08 17:42:33 -05:00
# Look for other users with the same IP, course instances, clans, or similar school names or non-common shared email domains.
2015-12-08 15:12:56 -05:00
suggestions = [ ]
if target . lastIP
for otherUser in userCategories . lastIP [ target . lastIP ] when otherUser isnt target
suggestions . push schoolName: otherUser . schoolName , reasons: [ " IP match " ] , user: otherUser
for leagueType in [ ' courseInstances ' , ' clans ' ]
if target [ leagueType ] ? . length
for league in target [ leagueType ]
for otherUser in userCategories [ leagueType ] [ league ] when otherUser isnt target
reason = " #{ _ . str . humanize ( leagueType ) } match "
if existingSuggestion = _ . find ( suggestions , user: otherUser )
existingSuggestion . reasons . push reason
else
suggestions . push schoolName: otherUser . schoolName , reasons: [ reason ] , user: otherUser
if target . schoolName ? . length > 5
nameMatches = [ ]
for otherSchoolName in topGroups . schoolName
score = stringScore otherSchoolName , target . schoolName , 0.8
continue if score < 0.25
nameMatches . push schoolName: otherSchoolName , score: score
nameMatches = ( match . schoolName for match in ( _ . sortBy nameMatches , (match) -> - match . score ) )
for match in nameMatches . slice ( 0 , 10 )
reason = " Name match "
for otherUser in userCategories . schoolName [ match ] when otherUser isnt target
if existingSuggestion = _ . find ( suggestions , user: otherUser )
existingSuggestion . reasons . push reason
else
suggestions . push schoolName: match , reasons: [ reason ] , user: otherUser
2015-12-08 17:42:33 -05:00
if domain = getDomain target
for otherUser in userCategories . domain [ domain ] when otherUser isnt target
reason = " Domain match "
if existingSuggestion = _ . find ( suggestions , user: otherUser )
existingSuggestion . reasons . push reason
else
suggestions . push schoolName: otherUser . schoolName , reasons: [ reason ] , user: otherUser
2015-12-08 15:12:56 -05:00
return _ . uniq suggestions , ' user '
userCategories = { }
topGroups = { }
usersCategorized = { }
sortUsers = (users) ->
users = _ . sortBy users , (u) -> - u . points
users = _ . sortBy users , [ ' schoolName ' , ' lastIP ' ]
2015-12-08 17:42:33 -05:00
for field in [ ' courseInstances ' , ' lastIP ' , ' schoolName ' , ' domain ' , ' clans ' ]
2015-12-08 15:12:56 -05:00
userCategories [ field ] = categorizeUsers users , field
topGroups [ field ] = _ . sortBy _ . keys ( userCategories [ field ] ) , (key) -> - userCategories [ field ] [ key ] . length
topGroups [ field ] = ( group for group in topGroups [ field ] when 2 < userCategories [ field ] [ group ] . length < ( if field is ' clans ' then 30 else 5000 ) )
categorizeUsers = (users, field) ->
categories = { }
2015-12-08 17:42:33 -05:00
for user in users
if field is ' domain '
value = getDomain user
else
value = user [ field ]
continue unless value
2015-12-08 15:12:56 -05:00
values = if _ . isArray ( value ) then value else [ value ]
for value in values when value
2015-12-08 17:42:33 -05:00
continue if value . trim and not value = value . trim ( )
2015-12-08 15:12:56 -05:00
categories [ value ] ? = [ ]
categories [ value ] . push user
categories
2015-12-01 21:04:50 -05:00
2015-12-08 17:42:33 -05:00
getDomain = (user) ->
domain = user . emailLower . split ( ' @ ' ) [ 1 ]
return null if commonEmailDomainMap [ domain ]
typo = _ . find commonEmailDomains , (commonDomain) -> stringScore ( commonDomain , domain , 0.8 ) > 0.9
return null if typo
domain
2015-12-01 21:04:50 -05:00
# https://github.com/joshaven/string_score
stringScore = (_a, word, fuzziness) ->
return 1 if word is _a
return 0 if word is " "
runningScore = 0
string = _a
lString = string . toLowerCase ( )
strLength = string . length
lWord = word . toLowerCase ( )
wordLength = word . length
startAt = 0
fuzzies = 1
if fuzziness
fuzzyFactor = 1 - fuzziness
if fuzziness
for i in [ 0 . . . wordLength ]
idxOf = lString . indexOf lWord [ i ] , startAt
if idxOf is - 1
fuzzies += fuzzyFactor
else
if startAt is idxOf
charScore = 0.7
else
charScore = 0.1
charScore += 0.8 if string [ idxOf - 1 ] is ' '
charScore += 0.1 if string [ idxOf ] is word [ i ]
runningScore += charScore
startAt = idxOf + 1
else
for i in [ 0 . . . wordLength ]
idxOf = lString . indexOf lWord [ i ] , startAt
return 0 if idxOf is - 1
if startAt is idxOf
charScore = 0.7
else
charScore = 0.1
charScore += 0.8 if string [ idxOf - 1 ] is word [ i ]
runningScore += charScore
startAt = idxOf + 1
finalScore = 0.5 * ( runningScore / strLength + runningScore / wordLength ) / fuzzies
finalScore += 0.15 if lWord [ 0 ] is lString [ 0 ] and finalScore < 0.85
finalScore
prompt = (question, callback) ->
process . stdin . resume ( )
process . stdout . write question
process . stdin . once ' data ' , (data) ->
callback data . toString ( ) . trim ( )
2015-12-08 17:42:33 -05:00
# https://github.com/mailcheck/mailcheck/wiki/List-of-Popular-Domains
commonEmailDomains = [
# Default domains included
" aol.com " , " att.net " , " comcast.net " , " facebook.com " , " gmail.com " , " gmx.com " , " googlemail.com " ,
" google.com " , " hotmail.com " , " hotmail.co.uk " , " mac.com " , " me.com " , " mail.com " , " msn.com " ,
" live.com " , " sbcglobal.net " , " verizon.net " , " yahoo.com " , " yahoo.co.uk " ,
# Other global domains
" email.com " , " games.com " , " gmx.net " , " hush.com " , " hushmail.com " , " icloud.com " , " inbox.com " ,
" lavabit.com " , " love.com " , " outlook.com " , " pobox.com " , " rocketmail.com " ,
" safe-mail.net " , " wow.com " , " ygm.com " , " ymail.com " , " zoho.com " , " fastmail.fm " ,
# United States ISP domains
" bellsouth.net " , " charter.net " , " comcast.net " , " cox.net " , " earthlink.net " , " juno.com " ,
# British ISP domains
" btinternet.com " , " virginmedia.com " , " blueyonder.co.uk " , " freeserve.co.uk " , " live.co.uk " ,
" ntlworld.com " , " o2.co.uk " , " orange.net " , " sky.com " , " talktalk.co.uk " , " tiscali.co.uk " ,
" virgin.net " , " wanadoo.co.uk " , " bt.com " ,
# Domains used in Asia
" sina.com " , " qq.com " , " naver.com " , " hanmail.net " , " daum.net " , " nate.com " , " yahoo.co.jp " , " yahoo.co.kr " , " yahoo.co.id " , " yahoo.co.in " , " yahoo.com.sg " , " yahoo.com.ph " ,
# French ISP domains
" hotmail.fr " , " live.fr " , " laposte.net " , " yahoo.fr " , " wanadoo.fr " , " orange.fr " , " gmx.fr " , " sfr.fr " , " neuf.fr " , " free.fr " ,
# German ISP domains
" gmx.de " , " hotmail.de " , " live.de " , " online.de " , " t-online.de " , " web.de " , " yahoo.de " ,
# Russian ISP domains
" mail.ru " , " rambler.ru " , " yandex.ru " , " ya.ru " , " list.ru " ,
# Belgian ISP domains
" hotmail.be " , " live.be " , " skynet.be " , " voo.be " , " tvcablenet.be " , " telenet.be " ,
# Argentinian ISP domains
" hotmail.com.ar " , " live.com.ar " , " yahoo.com.ar " , " fibertel.com.ar " , " speedy.com.ar " , " arnet.com.ar " ,
# Domains used in Mexico
" hotmail.com " , " gmail.com " , " yahoo.com.mx " , " live.com.mx " , " yahoo.com " , " hotmail.es " , " live.com " , " hotmail.com.mx " , " prodigy.net.mx " , " msn.com "
]
commonEmailDomainMap = { }
commonEmailDomainMap [ domain ] = true for domain in commonEmailDomainMap