mirror of
https://github.com/codeninjasllc/codecombat.git
synced 2025-03-31 07:12:49 -04:00
Stub script to organize schoolName data
This commit is contained in:
parent
0f8a5673da
commit
c0609dafd5
1 changed files with 113 additions and 0 deletions
113
scripts/buildSchoolGraph.coffee
Normal file
113
scripts/buildSchoolGraph.coffee
Normal file
|
@ -0,0 +1,113 @@
|
|||
# Organize our users' schoolNames.
|
||||
|
||||
database = require '../server/commons/database'
|
||||
mongoose = require 'mongoose'
|
||||
log = require 'winston'
|
||||
async = require 'async'
|
||||
|
||||
### SET UP ###
|
||||
do (setupLodash = this) ->
|
||||
GLOBAL._ = require 'lodash'
|
||||
_.str = require 'underscore.string'
|
||||
_.mixin _.str.exports()
|
||||
GLOBAL.tv4 = require('tv4').tv4
|
||||
|
||||
database.connect()
|
||||
|
||||
UserHandler = require '../server/users/user_handler'
|
||||
User = require '../server/users/User'
|
||||
|
||||
startDate = new Date 2015, 11, 1
|
||||
|
||||
query = dateCreated: {$gt: startDate}, emailLower: {$exists: true}
|
||||
selection = 'name emailLower schoolName courseInstances clans ageRange dateCreated referrer points'
|
||||
User.find(query).select(selection).lean().exec (err, users) ->
|
||||
usersWithSchools = _.filter users, 'schoolName'
|
||||
schoolNames = _.uniq (u.schoolName for u in usersWithSchools)
|
||||
log.info "Found #{usersWithSchools.length} users of #{users.length} users registered after #{startDate} with schools like:\n#{schoolNames.slice(0, 10).join('\n')}"
|
||||
|
||||
# For each user, come up with a confidence that their school is correct.
|
||||
# For users with low confidence, look for similarities to other users with high confidence.
|
||||
# If we have enough data, prompt to update the school.
|
||||
# After each update, recalculate confidence to find the next user with low confidence.
|
||||
|
||||
# How do we come up with confidence estimate?
|
||||
# If there are many students with the same school name, it's either correct or a rename must happen.
|
||||
# If the school name is unique but similar to a school name with many students, it's probably incorrect.
|
||||
# But if we determine it is correct, how can we record this fact so it doesn't keep asking?
|
||||
|
||||
# How can we infer the school name when we think it's not correct?
|
||||
# We look for users with confident schoolNames in shared courseInstances.
|
||||
# ... in shared clans.
|
||||
# ... with the same lastIP that doesn't cover the lastIP of students from multiple schools.
|
||||
# If we find a school-district-formatted email domain, we could try to match to other schoolNames in that domain, but I doubt that will be helpful until we have a lot of data and a lot of time to manually look things up.
|
||||
|
||||
# TODO: do all this work when we actually have a bunch of schoolNames in the system, or these heuristics won't be well-calibrated.
|
||||
|
||||
nextPrompt users
|
||||
|
||||
nextPrompt = (users) ->
|
||||
return console.log('Done.') or process.exit() unless [userToSchool, suggestions] = findUserToSchool users
|
||||
prompt "What should the school for #{JSON.stringify(userToSchool)} be?\nSuggestions: #{suggestions}\n", (answer) ->
|
||||
return console.log('Bye.') or process.exit() if answer in ['q', 'quit']
|
||||
console.log "You said #{answer}, so we should do something about that."
|
||||
nextPrompt users
|
||||
|
||||
findUserToSchool = (users) ->
|
||||
users.sort (a, b) -> b.points - a.points
|
||||
usersWithSchools = _.filter users, 'schoolName'
|
||||
schoolNames = _.uniq (u.schoolName for u in usersWithSchools)
|
||||
return [users[0], schoolNames]
|
||||
|
||||
|
||||
# https://github.com/joshaven/string_score
|
||||
stringScore = (_a, word, fuzziness) ->
|
||||
return 1 if word is _a
|
||||
return 0 if word is ""
|
||||
|
||||
runningScore = 0
|
||||
string = _a
|
||||
lString = string.toLowerCase()
|
||||
strLength = string.length
|
||||
lWord = word.toLowerCase()
|
||||
wordLength = word.length
|
||||
startAt = 0
|
||||
fuzzies = 1
|
||||
|
||||
if fuzziness
|
||||
fuzzyFactor = 1 - fuzziness
|
||||
|
||||
if fuzziness
|
||||
for i in [0...wordLength]
|
||||
idxOf = lString.indexOf lWord[i], startAt
|
||||
if idxOf is -1
|
||||
fuzzies += fuzzyFactor
|
||||
else
|
||||
if startAt is idxOf
|
||||
charScore = 0.7
|
||||
else
|
||||
charScore = 0.1
|
||||
charScore += 0.8 if string[idxOf - 1] is ' '
|
||||
charScore += 0.1 if string[idxOf] is word[i]
|
||||
runningScore += charScore
|
||||
startAt = idxOf + 1
|
||||
else
|
||||
for i in [0...wordLength]
|
||||
idxOf = lString.indexOf lWord[i], startAt
|
||||
return 0 if idxOf is -1
|
||||
if startAt is idxOf
|
||||
charScore = 0.7
|
||||
else
|
||||
charScore = 0.1
|
||||
charScore += 0.8 if string[idxOf - 1] is word[i]
|
||||
runningScore += charScore
|
||||
startAt = idxOf + 1
|
||||
finalScore = 0.5 * (runningScore / strLength + runningScore / wordLength) / fuzzies
|
||||
finalScore += 0.15 if lWord[0] is lString[0] and finalScore < 0.85
|
||||
finalScore
|
||||
|
||||
prompt = (question, callback) ->
|
||||
process.stdin.resume()
|
||||
process.stdout.write question
|
||||
process.stdin.once 'data', (data) ->
|
||||
callback data.toString().trim()
|
Loading…
Add table
Reference in a new issue