diff options
author | Morgan <m@morganastra.me> | 2015-03-15 02:07:06 -0700 |
---|---|---|
committer | Morgan <m@morganastra.me> | 2015-03-15 02:07:06 -0700 |
commit | 25bab46051adf02b9d4a82077ff8e69ed219c6aa (patch) | |
tree | cc46e5e261fffdd7648f5eb9a8651fa5bc49b72a /src/pronouns/util.clj | |
parent | 398d0e1958b1f357f650146a5fc1942adf033de6 (diff) | |
parent | 2be8d2cd5e6b29b75aa93108bf738dd2d3b0bb21 (diff) |
Merge pull request #17 from non/topic/disambiguate
Add code to disambiguate pronouns dynamically.
Diffstat (limited to 'src/pronouns/util.clj')
-rw-r--r-- | src/pronouns/util.clj | 29 |
1 file changed, 29 insertions, 0 deletions
;; diff --git a/src/pronouns/util.clj b/src/pronouns/util.clj
;; index 1269664..8327568 100644
;; --- a/src/pronouns/util.clj
;; +++ b/src/pronouns/util.clj
;; @@ -14,3 +14,32 @@
;;
;; Unchanged context preceding the added code:
;;   (defn tabfile-lookup [query-key tabfile]
;;     (table-lookup query-key (slurp-tabfile tabfile)))
;;
;; Code added by this commit (reformatted from the collapsed diff):

(defn disambiguate
  "Given a row and its lexically-closest neighbors, determine the
  smallest abbreviation which is still distinct.

  `prev` and `next` are the neighboring rows; either may be nil, in which
  case that side never collides with a non-empty prefix of `row`.

  Returns the shortest leading prefix of `row` (as a sequence) that is not
  also the same-length prefix of `prev` or `next`. If no prefix shorter
  than 5 items is distinct, returns `row` itself unchanged."
  [prev row next]
  (loop [n 1]
    (let [row-n (take n row)]
      (cond
        ;; Give up and return the whole row once the prefix length reaches 5.
        ;; NOTE(review): 5 presumably matches the number of forms in a
        ;; pronoun row -- confirm against the table format.
        (>= n 5) row

        ;; The prefix still collides with a neighbor: try one more item.
        (or (= row-n (take n prev))
            (= row-n (take n next))) (recur (inc n))

        :else row-n))))

(defn abbreviate
  "Given a list of pronoun rows, return a sequence of pairs, where the
  first item is the abbreviation and the second is the original pronoun
  row.

  `sorted-table` is expected to be ordered so that rows sharing a prefix
  are adjacent; each row is only compared with its immediate neighbors.

  Returns an empty sequence for an empty or nil table. (A hand-rolled loop
  that always processes `(first sorted-table)` would emit a bogus
  `(nil nil)` pair in that case.)"
  [sorted-table]
  (let [rows (seq sorted-table)]
    ;; Walk three aligned views of the table: each row's predecessor, the
    ;; row itself, and its successor. nil stands in for the missing
    ;; neighbor at either end; `map` stops at the shortest collection, so
    ;; exactly one pair is produced per row.
    (map (fn [prev row next]
           (list (disambiguate prev row next) row))
         (cons nil rows)
         rows
         (concat (rest rows) [nil]))))