about summary refs log tree commit diff
path: root/src/pronouns/util.clj
diff options
context:
space:
mode:
Diffstat (limited to 'src/pronouns/util.clj')
-rw-r--r--src/pronouns/util.clj29
1 files changed, 29 insertions, 0 deletions
diff --git a/src/pronouns/util.clj b/src/pronouns/util.clj
index 1269664..1d4c09a 100644
--- a/src/pronouns/util.clj
+++ b/src/pronouns/util.clj
@@ -14,3 +14,32 @@
 (defn tabfile-lookup
   [query-key tabfile]
   (table-lookup query-key (slurp-tabfile tabfile)))
+
+; given a row and its lexically-closest neighbors,
+; determine the smallest abbreviation which is still
+; distinct.
+(defn disambiguate
+  [prev row next]
+  (loop [n 1]
+    (let [row-n (take n row)]
+      (cond
+        (>= n 5) row
+        (= row-n (take n prev)) (recur (+ n 1))
+        (= row-n (take n next)) (recur (+ n 1))
+        :else row-n))))
+
+; given a list of pronoun rows, return a list of
+; pairs, where the first item is the abbreviation
+; and the second is the original pronoun row.
+(defn abbreviate
+  [sorted-table]
+  (loop [acc nil
+         prev nil
+         row (first sorted-table)
+         todo (rest sorted-table)]
+    (let [next (first todo)
+          abbrev (disambiguate prev row next)
+          pair (list abbrev row)
+          acc2 (conj acc pair)]
+      (if (empty? todo) (reverse acc2)
+          (recur acc2 row next (rest todo))))))