mirror of
https://github.com/codeninjasllc/codecombat.git
synced 2024-11-30 10:56:53 -05:00
buildSchoolGraph script working well
This commit is contained in:
parent
7076a23fe2
commit
0539f796a7
1 changed file with 46 additions and 29 deletions
|
@ -4,6 +4,7 @@ database = require '../server/commons/database'
|
||||||
mongoose = require 'mongoose'
|
mongoose = require 'mongoose'
|
||||||
log = require 'winston'
|
log = require 'winston'
|
||||||
async = require 'async'
|
async = require 'async'
|
||||||
|
moment = require 'moment'
|
||||||
|
|
||||||
### SET UP ###
|
### SET UP ###
|
||||||
do (setupLodash = this) ->
|
do (setupLodash = this) ->
|
||||||
|
@ -19,6 +20,8 @@ User = require '../server/users/User'
|
||||||
|
|
||||||
startDate = new Date 2015, 11, 1
|
startDate = new Date 2015, 11, 1
|
||||||
|
|
||||||
|
debugging = false
|
||||||
|
|
||||||
query = dateCreated: {$gt: startDate}, emailLower: {$exists: true}
|
query = dateCreated: {$gt: startDate}, emailLower: {$exists: true}
|
||||||
selection = 'name emailLower schoolName courseInstances clans ageRange dateCreated referrer points lastIP hourOfCode preferredLanguage lastLevel'
|
selection = 'name emailLower schoolName courseInstances clans ageRange dateCreated referrer points lastIP hourOfCode preferredLanguage lastLevel'
|
||||||
User.find(query).select(selection).lean().exec (err, users) ->
|
User.find(query).select(selection).lean().exec (err, users) ->
|
||||||
|
@ -26,9 +29,10 @@ User.find(query).select(selection).lean().exec (err, users) ->
|
||||||
log.info "Found #{usersWithSchools.length} users of #{users.length} users registered after #{startDate}."
|
log.info "Found #{usersWithSchools.length} users of #{users.length} users registered after #{startDate}."
|
||||||
nextPrompt users
|
nextPrompt users
|
||||||
|
|
||||||
nextPrompt = (users, question) ->
|
nextPrompt = (users, question, userToSchool, suggestions) ->
|
||||||
# We look for the next top user to classify based on the number of suggestions we can make about what the school name should be.
|
# We look for the next top user to classify based on the number of suggestions we can make about what the school name should be.
|
||||||
sortUsers users
|
sortUsers users
|
||||||
|
unless userToSchool
|
||||||
return console.log('Done.') or process.exit() unless [userToSchool, suggestions] = findUserToSchool users
|
return console.log('Done.') or process.exit() unless [userToSchool, suggestions] = findUserToSchool users
|
||||||
question ?= formatSuggestions userToSchool, suggestions
|
question ?= formatSuggestions userToSchool, suggestions
|
||||||
prompt question, (answer) ->
|
prompt question, (answer) ->
|
||||||
|
@ -41,7 +45,10 @@ nextPrompt = (users, question) ->
|
||||||
return finalizePrompt userToSchool, suggestions, schoolName, users
|
return finalizePrompt userToSchool, suggestions, schoolName, users
|
||||||
else if answer.length < 10
|
else if answer.length < 10
|
||||||
console.log "#{answer}? That's kind of short--I don't think school names and locations can be this short. What should it really be?"
|
console.log "#{answer}? That's kind of short--I don't think school names and locations can be this short. What should it really be?"
|
||||||
return nextPrompt users, "> "
|
return nextPrompt users, "> ", userToSchool, suggestions
|
||||||
|
else unless /,.+,/.test answer
|
||||||
|
console.log "#{answer}? We need the full location (with two commas), like Example High School, Springfield, IL. What should it really be?"
|
||||||
|
return nextPrompt users, "> ", userToSchool, suggestions
|
||||||
else
|
else
|
||||||
return finalizePrompt userToSchool, suggestions, answer, users
|
return finalizePrompt userToSchool, suggestions, answer, users
|
||||||
|
|
||||||
|
@ -79,7 +86,7 @@ finalizePrompt = (userToSchool, suggestions, schoolName, users) ->
|
||||||
nextPrompt remainingUsers
|
nextPrompt remainingUsers
|
||||||
|
|
||||||
findNumbers = (answer, max) ->
|
findNumbers = (answer, max) ->
|
||||||
numbers = (parseInt(d, 10) for d in (' ' + answer + ' ').match(/ (\d+) /g) ? [])
|
numbers = (parseInt(d, 10) for d in (' ' + answer + ' ').replace(/ /g, ' ').match(/ (\d+) /g) ? [])
|
||||||
ranges = answer.match(/(\d+-\d+)/g) or []
|
ranges = answer.match(/(\d+-\d+)/g) or []
|
||||||
for range in ranges
|
for range in ranges
|
||||||
bounds = (parseInt(d, 10) for d in range.split('-'))
|
bounds = (parseInt(d, 10) for d in range.split('-'))
|
||||||
|
@ -90,12 +97,14 @@ findNumbers = (answer, max) ->
|
||||||
console.log "Incorrect number #{number} higher than max: #{max}"
|
console.log "Incorrect number #{number} higher than max: #{max}"
|
||||||
numbers
|
numbers
|
||||||
|
|
||||||
formatUser = (user) ->
|
formatUser = (user, relativeToUser) ->
|
||||||
# TODO: replace date string with relative time since signup compared to target user, and actually make suggestions based on students that signed up at almost the same time
|
# TODO: replace date string with relative time since signup compared to target user, and actually make suggestions based on students that signed up at almost the same time
|
||||||
_.values(_.pick(user, ['name', 'emailLower', 'ageRange', 'dateCreated', 'lastLevel', 'points', 'referrer', 'hourOfCode'])).join(' ')
|
props = _.pick(user, ['name', 'emailLower', 'ageRange', 'dateCreated', 'lastLevel', 'points', 'referrer', 'hourOfCode'])
|
||||||
|
props.dateCreated = if relativeToUser then moment(props.dateCreated).from(relativeToUser.dateCreated) else moment(props.dateCreated).fromNow()
|
||||||
|
_.values(props).join(' ')
|
||||||
|
|
||||||
formatSuggestions = (userToSchool, suggestions) ->
|
formatSuggestions = (userToSchool, suggestions) ->
|
||||||
suggestionPrompts = ("#{_.str.rpad(i + 1, 3)} #{_.str.rpad(s.schoolName, 50)} #{s.reasons.length} #{if s.reasons.length > 1 then 'Matches' else 'Match'}: #{s.reasons.join(', ')}\tfrom user: #{formatUser(s.user)}" for s, i in suggestions).join('\n')
|
suggestionPrompts = ("#{_.str.rpad(i + 1, 3)} #{_.str.rpad(s.schoolName, 50)} #{s.reasons.length} #{if s.reasons.length > 1 then 'Matches' else 'Match'}: #{s.reasons.join(', ')}\tfrom user: #{formatUser(s.user, userToSchool)}" for s, i in suggestions).join('\n')
|
||||||
"""
|
"""
|
||||||
What should the school for this user be?
|
What should the school for this user be?
|
||||||
0 #{_.str.rpad(userToSchool.schoolName, 50)} #{formatUser(userToSchool)}
|
0 #{_.str.rpad(userToSchool.schoolName, 50)} #{formatUser(userToSchool)}
|
||||||
|
@ -104,20 +113,24 @@ formatSuggestions = (userToSchool, suggestions) ->
|
||||||
Choose a number, type a name, enter to skip, or q to quit.
|
Choose a number, type a name, enter to skip, or q to quit.
|
||||||
> """
|
> """
|
||||||
|
|
||||||
|
checkedTopGroups = {}
|
||||||
findUserToSchool = (users) ->
|
findUserToSchool = (users) ->
|
||||||
# We find the top user from the top group that we can make the most reasoned suggestions about what the school name would be.
|
# We find the top user from the top group that we can make the most reasoned suggestions about what the school name would be.
|
||||||
[bestTarget, bestTargetSuggestions, mostReasons] = [null, [], 0]
|
[bestTarget, bestTargetSuggestions, mostReasons, bestGroup] = [null, [], 0, null]
|
||||||
for field, groups of topGroups
|
for field, groups of topGroups
|
||||||
for nextLargestGroup in groups
|
for nextLargestGroup in groups when not checkedTopGroups[nextLargestGroup]
|
||||||
possibleTargets = userCategories[field][nextLargestGroup]
|
possibleTargets = userCategories[field][nextLargestGroup]
|
||||||
schoolNames = _.uniq possibleTargets, 'schoolName'
|
schoolNames = (t.schoolName for t in _.uniq possibleTargets, 'schoolName')
|
||||||
# TODO: better method to avoid showing users where everyone in the suggestion already has the same school (because we have already done this group)
|
# TODO: better method to avoid showing users where everyone in the suggestion already has the same school (because we have already done this group)
|
||||||
for schoolName in schoolNames
|
alreadyDone = false
|
||||||
if _.filter(possibleTargets, schoolName: schoolName).length > 0.5 * possibleTargets.length
|
for schoolName in schoolNames when schoolName?.length > 10 and /,.+,/.test schoolName # Long enough school name with location info (two commas)
|
||||||
|
sharedCount = _.filter(possibleTargets, schoolName: schoolName).length
|
||||||
|
if sharedCount > 0.5 * possibleTargets.length
|
||||||
|
console.log 'Already done', schoolName, sharedCount, possibleTargets.length, 'for', field, nextLargestGroup
|
||||||
alreadyDone = true
|
alreadyDone = true
|
||||||
continue if alreadyDone
|
continue if alreadyDone
|
||||||
nSamples = Math.min 15, Math.max(4, Math.floor possibleTargets.length / 20)
|
nSamples = Math.min 15, Math.max(4, Math.floor possibleTargets.length / 20)
|
||||||
console.log 'Checking', nSamples, 'samples of', possibleTargets.length, 'players in the biggest', field, 'group:', nextLargestGroup
|
if debugging then console.log 'Checking', nSamples, 'samples of', possibleTargets.length, 'players in the biggest', field, 'group:', nextLargestGroup
|
||||||
for i in [0 ... nSamples]
|
for i in [0 ... nSamples]
|
||||||
target = possibleTargets[Math.floor i * possibleTargets.length / (nSamples + 1)]
|
target = possibleTargets[Math.floor i * possibleTargets.length / (nSamples + 1)]
|
||||||
suggestions = findSuggestions target
|
suggestions = findSuggestions target
|
||||||
|
@ -130,26 +143,28 @@ findUserToSchool = (users) ->
|
||||||
when 'Name' then 30
|
when 'Name' then 30
|
||||||
when 'Referrer' then 20
|
when 'Referrer' then 20
|
||||||
when 'Domain' then (if getDomain(target) is 'cps.edu' then 1 else 10)
|
when 'Domain' then (if getDomain(target) is 'cps.edu' then 1 else 10)
|
||||||
when 'Clans' then 0.1
|
when 'Clans' then 0.01
|
||||||
sum
|
sum
|
||||||
), 0
|
), 0
|
||||||
if reasons > mostReasons
|
if reasons > mostReasons
|
||||||
bestTarget = target
|
bestTarget = target
|
||||||
bestTargetSuggestions = suggestions
|
bestTargetSuggestions = suggestions
|
||||||
mostReasons = reasons
|
mostReasons = reasons
|
||||||
|
bestGroup = nextLargestGroup
|
||||||
break
|
break
|
||||||
|
checkedTopGroups[bestGroup] = true
|
||||||
return [bestTarget, bestTargetSuggestions]
|
return [bestTarget, bestTargetSuggestions]
|
||||||
|
|
||||||
findSuggestions = (target) ->
|
findSuggestions = (target) ->
|
||||||
# Look for other users with the same IP, course instances, clans, or similar school names or non-common shared email domains.
|
# Look for other users with the same IP, course instances, clans, or similar school names or non-common shared email domains.
|
||||||
suggestions = []
|
suggestions = []
|
||||||
t0 = new Date()
|
t0 = new Date()
|
||||||
console.log ' Checking suggestions for', target.emailLower, target.schoolName, (new Date()) - t0
|
if debugging then console.log ' Checking suggestions for', target.emailLower, target.schoolName, (new Date()) - t0
|
||||||
if target.lastIP
|
if target.lastIP
|
||||||
for otherUser in (userCategories.lastIP[target.lastIP] ? []) when otherUser isnt target
|
for otherUser in (userCategories.lastIP[target.lastIP] ? []) when otherUser isnt target
|
||||||
suggestions.push schoolName: otherUser.schoolName, reasons: ['IP'], user: otherUser
|
suggestions.push schoolName: otherUser.schoolName, reasons: ['IP'], user: otherUser
|
||||||
for leagueType in ['courseInstances', 'clans']
|
for leagueType in ['courseInstances', 'clans']
|
||||||
console.log ' Now checking', leagueType, (new Date()) - t0
|
if debugging then console.log ' Now checking', leagueType, (new Date()) - t0
|
||||||
if target[leagueType]?.length
|
if target[leagueType]?.length
|
||||||
for league in target[leagueType]
|
for league in target[leagueType]
|
||||||
for otherUser in (userCategories[leagueType][league] ? []) when otherUser isnt target
|
for otherUser in (userCategories[leagueType][league] ? []) when otherUser isnt target
|
||||||
|
@ -159,7 +174,7 @@ findSuggestions = (target) ->
|
||||||
else
|
else
|
||||||
suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser
|
suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser
|
||||||
if target.schoolName?.length > 5
|
if target.schoolName?.length > 5
|
||||||
console.log ' Now checking schoolName', (new Date()) - t0
|
if debugging then console.log ' Now checking schoolName', (new Date()) - t0
|
||||||
nameMatches = []
|
nameMatches = []
|
||||||
for otherSchoolName in topGroups.schoolName
|
for otherSchoolName in topGroups.schoolName
|
||||||
score = stringScore otherSchoolName, target.schoolName, 0.8
|
score = stringScore otherSchoolName, target.schoolName, 0.8
|
||||||
|
@ -173,7 +188,7 @@ findSuggestions = (target) ->
|
||||||
existingSuggestion.reasons.push reason
|
existingSuggestion.reasons.push reason
|
||||||
else
|
else
|
||||||
suggestions.push schoolName: match, reasons: [reason], user: otherUser
|
suggestions.push schoolName: match, reasons: [reason], user: otherUser
|
||||||
console.log ' Now checking domain', (new Date()) - t0
|
if debugging then console.log ' Now checking domain', (new Date()) - t0
|
||||||
if domain = getDomain target
|
if domain = getDomain target
|
||||||
for otherUser in (userCategories.domain[domain] ? []) when otherUser isnt target
|
for otherUser in (userCategories.domain[domain] ? []) when otherUser isnt target
|
||||||
reason = "Domain"
|
reason = "Domain"
|
||||||
|
@ -181,7 +196,7 @@ findSuggestions = (target) ->
|
||||||
existingSuggestion.reasons.push reason
|
existingSuggestion.reasons.push reason
|
||||||
else
|
else
|
||||||
suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser
|
suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser
|
||||||
console.log ' Now checking referrer', (new Date()) - t0
|
if debugging then console.log ' Now checking referrer', (new Date()) - t0
|
||||||
if referrer = getReferrer target
|
if referrer = getReferrer target
|
||||||
for otherUser in (userCategories.referrer[referrer] ? []) when otherUser isnt target
|
for otherUser in (userCategories.referrer[referrer] ? []) when otherUser isnt target
|
||||||
reason = "Referrer"
|
reason = "Referrer"
|
||||||
|
@ -189,6 +204,7 @@ findSuggestions = (target) ->
|
||||||
existingSuggestion.reasons.push reason
|
existingSuggestion.reasons.push reason
|
||||||
else
|
else
|
||||||
suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser
|
suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser
|
||||||
|
if debugging then console.log ' Done checking referrer', (new Date()) - t0
|
||||||
suggestions = _.sortBy suggestions, 'schoolName'
|
suggestions = _.sortBy suggestions, 'schoolName'
|
||||||
suggestions = _.sortBy suggestions, (s) -> -s.reasons.length
|
suggestions = _.sortBy suggestions, (s) -> -s.reasons.length
|
||||||
return suggestions
|
return suggestions
|
||||||
|
@ -226,20 +242,21 @@ typoCache = {}
|
||||||
getDomain = (user) ->
|
getDomain = (user) ->
|
||||||
return null unless domain = user.emailLower.split('@')[1]
|
return null unless domain = user.emailLower.split('@')[1]
|
||||||
return null if commonEmailDomainMap[domain]
|
return null if commonEmailDomainMap[domain]
|
||||||
typo = typoCache[domain]
|
# Too slow? Is this actually slow?
|
||||||
return null if typo
|
#typo = typoCache[domain]
|
||||||
return domain if typo is false
|
#return null if typo
|
||||||
typo = _.find commonEmailDomains, (commonDomain) -> stringScore(commonDomain, domain, 0.8) > 0.9
|
#return domain if typo is false
|
||||||
typoCache[domain] = Boolean(typo)
|
#typo = _.find commonEmailDomains, (commonDomain) -> stringScore(commonDomain, domain, 0.8) > 0.9
|
||||||
return null if typo
|
#typoCache[domain] = Boolean(typo)
|
||||||
|
#return null if typo
|
||||||
domain
|
domain
|
||||||
|
|
||||||
commonReferrersRegex = /(google|bing\.|yahoo|duckduckgo|jobs\.lever|code\.org|twitter|facebook|dollarclick|stumbleupon|vk\.com|playpcesor|reddit|lifehacker|favorite|bnext|freelance|taringa|blogthinkbig|graphism|inside\.com|korben|habrahabr|iplaysoft|geekbrains|playground|ycombinator|github\.com)/
|
commonReferrersRegex = /(google|bing\.|yahoo|duckduckgo|jobs\.lever|code\.org|twitter|facebook|dollarclick|stumbleupon|vk\.com|playpcesor|reddit|lifehacker|favorite|bnext|freelance|taringa|blogthinkbig|graphism|inside\.com|korben|habrahabr|iplaysoft|geekbrains|playground|ycombinator|github)/
|
||||||
getReferrer = (user) ->
|
getReferrer = (user) ->
|
||||||
return null unless referrer = user.referrer?.toLowerCase().trim()
|
return null unless referrer = user.referrer?.toLowerCase().trim()
|
||||||
referrer = referrer.replace /^https?:\/\//, ''
|
referrer = referrer.replace /^https?:\/\//, ''
|
||||||
return null if commonReferrersRegex.test referrer
|
return null if commonReferrersRegex.test referrer
|
||||||
return classCode if classCode = referrer.match(/\?_cc=(\S+)/)?[1]
|
return classCode if classCode = referrer.match(/\?_cc=(\S+)$/)?[1]
|
||||||
return null if /codecombat/.test referrer
|
return null if /codecombat/.test referrer
|
||||||
referrer
|
referrer
|
||||||
|
|
||||||
|
@ -316,7 +333,7 @@ commonEmailDomains = [
|
||||||
"virgin.net", "wanadoo.co.uk", "bt.com",
|
"virgin.net", "wanadoo.co.uk", "bt.com",
|
||||||
|
|
||||||
# Domains used in Asia
|
# Domains used in Asia
|
||||||
"sina.com", "qq.com", "naver.com", "hanmail.net", "daum.net", "nate.com", "yahoo.co.jp", "yahoo.co.kr", "yahoo.co.id", "yahoo.co.in", "yahoo.com.sg", "yahoo.com.ph",
|
"sina.com", "qq.com", "naver.com", "hanmail.net", "daum.net", "nate.com", "yahoo.co.jp", "yahoo.co.kr", "yahoo.co.id", "yahoo.co.in", "yahoo.com.sg", "yahoo.com.ph", "yahoo.com.tw"
|
||||||
|
|
||||||
# French ISP domains
|
# French ISP domains
|
||||||
"hotmail.fr", "live.fr", "laposte.net", "yahoo.fr", "wanadoo.fr", "orange.fr", "gmx.fr", "sfr.fr", "neuf.fr", "free.fr",
|
"hotmail.fr", "live.fr", "laposte.net", "yahoo.fr", "wanadoo.fr", "orange.fr", "gmx.fr", "sfr.fr", "neuf.fr", "free.fr",
|
||||||
|
@ -337,4 +354,4 @@ commonEmailDomains = [
|
||||||
"hotmail.com", "gmail.com", "yahoo.com.mx", "live.com.mx", "yahoo.com", "hotmail.es", "live.com", "hotmail.com.mx", "prodigy.net.mx", "msn.com"
|
"hotmail.com", "gmail.com", "yahoo.com.mx", "live.com.mx", "yahoo.com", "hotmail.es", "live.com", "hotmail.com.mx", "prodigy.net.mx", "msn.com"
|
||||||
]
|
]
|
||||||
commonEmailDomainMap = {}
|
commonEmailDomainMap = {}
|
||||||
commonEmailDomainMap[domain] = true for domain in commonEmailDomainMap
|
commonEmailDomainMap[domain] = true for domain in commonEmailDomains
|
||||||
|
|
Loading…
Reference in a new issue