about summary refs log tree commit diff
path: root/src/pronouns/util.clj
diff options
context:
space:
mode:
authorErik Osheim <d_m@plastic-idolatry.com>2015-03-15 01:37:15 -0400
committerErik Osheim <d_m@plastic-idolatry.com>2015-03-15 01:37:15 -0400
commit573294ba752823d2fb6a708acf2533a6e14752c9 (patch)
tree0877789d068a84acd6845a8bb34d4c263310859e /src/pronouns/util.clj
parent08cadfb6d3bf39d43a05db6e14d6b43bcad9ee60 (diff)
Add code to disambiguate pronouns dynamically.
This commit improves the display code so that the
smallest unambiguous abbreviation is calculated
based on the contents of pronouns.tab.
Diffstat (limited to 'src/pronouns/util.clj')
-rw-r--r--src/pronouns/util.clj29
1 files changed, 29 insertions, 0 deletions
diff --git a/src/pronouns/util.clj b/src/pronouns/util.clj
index 1269664..1d4c09a 100644
--- a/src/pronouns/util.clj
+++ b/src/pronouns/util.clj
@@ -14,3 +14,32 @@
 (defn tabfile-lookup
   [query-key tabfile]
   (table-lookup query-key (slurp-tabfile tabfile)))
+
+; given a row and its lexically-closest neighbors,
+; determine the smallest abbreviation which is still
+; distinct.
+(defn disambiguate
+  [prev row next]
+  (loop [n 1]
+    (let [row-n (take n row)]
+      (cond
+        (>= n 5) row
+        (= row-n (take n prev)) (recur (+ n 1))
+        (= row-n (take n next)) (recur (+ n 1))
+        :else row-n))))
+
+; given a list of pronoun rows, return a list of
+; pairs, where the first item is the abbreviation
+; and the second is the original pronoun row.
+(defn abbreviate
+  [sorted-table]
+  (loop [acc nil
+         prev nil
+         row (first sorted-table)
+         todo (rest sorted-table)]
+    (let [next (first todo)
+          abbrev (disambiguate prev row next)
+          pair (list abbrev row)
+          acc2 (conj acc pair)]
+      (if (empty? todo) (reverse acc2)
+          (recur acc2 row next (rest todo))))))