diff --git a/scripts/buildSchoolGraph.coffee b/scripts/buildSchoolGraph.coffee index edc00f33b..7236c4a55 100644 --- a/scripts/buildSchoolGraph.coffee +++ b/scripts/buildSchoolGraph.coffee @@ -17,14 +17,15 @@ database.connect() UserHandler = require '../server/users/user_handler' User = require '../server/users/User' -startDate = new Date 2015, 11, 1 +#startDate = new Date 2015, 11, 1 +startDate = new Date 2015, 11, 8 # Testing query = dateCreated: {$gt: startDate}, emailLower: {$exists: true} -selection = 'name emailLower schoolName courseInstances clans ageRange dateCreated referrer points' +selection = 'name emailLower schoolName courseInstances clans ageRange dateCreated referrer points lastIP hourOfCode preferredLanguage lastLevel' User.find(query).select(selection).lean().exec (err, users) -> usersWithSchools = _.filter users, 'schoolName' schoolNames = _.uniq (u.schoolName for u in usersWithSchools) - log.info "Found #{usersWithSchools.length} users of #{users.length} users registered after #{startDate} with schools like:\n#{schoolNames.slice(0, 10).join('\n')}" + log.info "Found #{usersWithSchools.length} users of #{users.length} users registered after #{startDate} with schools like:\n\t#{schoolNames.slice(0, 10).join('\n\t')}" # For each user, come up with a confidence that their school is correct. # For users with low confidence, look for similarities to other users with high confidence. @@ -42,22 +43,138 @@ User.find(query).select(selection).lean().exec (err, users) -> # ... with the same lastIP that doesn't cover the lastIP of students from multiple schools. # If we find a school-district-formatted email domain, we could try to match to other schoolNames in that domain, but I doubt that will be helpful until we have a lot of data and a lot of time to manually look things up. - # TODO: do all this work when we actually have a bunch of schoolNames in the system, or these heuristics won't be well-calibrated. - nextPrompt users -nextPrompt = (users) -> +nextPrompt = (users, question) -> + sortUsers users return console.log('Done.') or process.exit() unless [userToSchool, suggestions] = findUserToSchool users - prompt "What should the school for #{JSON.stringify(userToSchool)} be?\nSuggestions: #{suggestions}\n", (answer) -> + question ?= formatSuggestions userToSchool, suggestions + prompt question, (answer) -> return console.log('Bye.') or process.exit() if answer in ['q', 'quit'] - console.log "You said #{answer}, so we should do something about that." + answer = answer.trim() + if answer is '' + users = _.without users, userToSchool + else unless _.isNaN(num = parseInt(answer, 10)) + schoolName = if num then suggestions[num - 1]?.schoolName else userToSchool.schoolName + return finalizePrompt userToSchool, suggestions, schoolName, users + else if answer.length < 10 + console.log "#{answer}? That's kind of short--I don't think school names and locations can be this short. What should it really be?" + return nextPrompt users, "> " + else + return finalizePrompt userToSchool, suggestions, answer, users nextPrompt users +finalizePrompt = (userToSchool, suggestions, schoolName, users) -> + console.log "Selected schoolName: \"#{schoolName}\"" + question = "Also apply this to other users? Ex.: 'all', '0 1 2 5', 'all -3 -4 -5', '0' to just do this one, or blank to retype school name.\n> " + prompt question, (answer) -> + answer = answer.trim() + if answer is '' + console.log "Should just do", userToSchool._id, userToSchool.emailLower, userToSchool.schoolName + targets = [userToSchool] + else if answer is 'all' + targets = [userToSchool].concat (s.user for s in suggestions) + console.log "Doing all #{targets.length} users..." + else if /^all/.test answer + targets = [userToSchool].concat (s.user for s in suggestions) + numbers = _.filter (parseInt(d, 10) for d in answer.split(/ *-/)), (n) -> not _.isNaN n + for number in numbers + skip = if number then suggestions[number - 1].user else userToSchool + targets = _.without targets, skip + console.log "Doing all #{targets.length} users without #{numbers}..." + else + numbers = _.filter (parseInt(d, 10) for d in answer.split(/ +/)), (n) -> not _.isNaN n + targets = ((if number then suggestions[number - 1].user else userToSchool) for number in numbers) + console.log "Doing #{targets.length} users for #{numbers}..." + #User.update {_id: {$in: (_.map targets, '_id')}}, {schoolName: schoolName}, {multi: true}, (err, result) -> + User.update {_id: {$in: []}}, {schoolName: schoolName}, {multi: true}, (err, result) -> + if err + console.error "Ran into error doing the save:", err + return finalizePrompt userToSchool, suggestions, schoolName, users + console.log "Updated users' schoolNames. Result:", result + remainingUsers = _.without users, targets... + nextPrompt remainingUsers + +formatUser = (user) -> + # TODO: replace date string with relative time since signup compared to target user + _.values(_.pick(user, ['name', 'emailLower', 'ageRange', 'dateCreated', 'lastLevel', 'points', 'referrer', 'hourOfCode'])).join(' ') + +formatSuggestions = (userToSchool, suggestions) -> + suggestionPrompts = ("#{_.str.rpad(i + 1, 3)} #{_.str.rpad(s.schoolName, 50)} #{s.reasons.join(' + ')}\tfrom user: #{formatUser(s.user)}" for s, i in suggestions).join('\n') + """ + What should the school for this user be? + 0 #{_.str.rpad(userToSchool.schoolName, 50)} #{formatUser(userToSchool)} + Suggestions: + #{suggestionPrompts} + Choose a number, type a name, enter to skip, or q to quit. + > """ + findUserToSchool = (users) -> - users.sort (a, b) -> b.points - a.points - usersWithSchools = _.filter users, 'schoolName' - schoolNames = _.uniq (u.schoolName for u in usersWithSchools) - return [users[0], schoolNames] + # TODO: don't show users where everyone in the suggestion already has the same school (because we have already done this group) + [bestTarget, bestTargetSuggestions, mostReasons] = [null, [], 0] + for field, groups of topGroups + largestGroup = groups[0] + target = userCategories[field][largestGroup][0] + suggestions = findSuggestions target + reasons = _.reduce suggestions, ((sum, suggestion) -> sum + (if suggestion.schoolName then suggestion.reasons.length else 0)), 0 + if reasons > mostReasons + bestTarget = target + bestTargetSuggestions = suggestions + mostReasons = reasons + return [bestTarget, bestTargetSuggestions] + +findSuggestions = (target) -> + suggestions = [] + if target.lastIP + for otherUser in userCategories.lastIP[target.lastIP] when otherUser isnt target + suggestions.push schoolName: otherUser.schoolName, reasons: ["IP match"], user: otherUser + for leagueType in ['courseInstances', 'clans'] + if target[leagueType]?.length + for league in target[leagueType] + for otherUser in userCategories[leagueType][league] when otherUser isnt target + reason = "#{_.str.humanize(leagueType)} match" + if existingSuggestion = _.find(suggestions, user: otherUser) + existingSuggestion.reasons.push reason + else + suggestions.push schoolName: otherUser.schoolName, reasons: [reason], user: otherUser + if target.schoolName?.length > 5 + nameMatches = [] + for otherSchoolName in topGroups.schoolName + score = stringScore otherSchoolName, target.schoolName, 0.8 + continue if score < 0.25 + nameMatches.push schoolName: otherSchoolName, score: score + nameMatches = (match.schoolName for match in (_.sortBy nameMatches, (match) -> -match.score)) + for match in nameMatches.slice(0, 10) + reason = "Name match" + for otherUser in userCategories.schoolName[match] when otherUser isnt target + if existingSuggestion = _.find(suggestions, user: otherUser) + existingSuggestion.reasons.push reason + else + suggestions.push schoolName: match, reasons: [reason], user: otherUser + return _.uniq suggestions, 'user' + +userCategories = {} +topGroups = {} +usersCategorized = {} + +sortUsers = (users) -> + users = _.sortBy users, (u) -> -u.points + users = _.sortBy users, ['schoolName', 'lastIP'] + # TODO: also match users by shared school email domains when we can identify those + for field in ['courseInstances', 'lastIP', 'schoolName', 'clans'] + userCategories[field] = categorizeUsers users, field + topGroups[field] = _.sortBy _.keys(userCategories[field]), (key) -> -userCategories[field][key].length + topGroups[field] = (group for group in topGroups[field] when 2 < userCategories[field][group].length < (if field is 'clans' then 30 else 5000)) + +categorizeUsers = (users, field) -> + categories = {} + for user in users when value = user[field] + values = if _.isArray(value) then value else [value] + for value in values when value + continue if value.trim and not value.trim() + categories[value] ?= [] + categories[value].push user + categories # https://github.com/joshaven/string_score