Skip to content

Commit

Permalink
improve-speaker-cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
uriva committed Jan 25, 2024
1 parent 8664991 commit 2897d23
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 16 deletions.
20 changes: 18 additions & 2 deletions src/index.test.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import { sideLog } from "https://deno.land/x/gamla@43.0.0/src/debug.ts";
import { regexpTimes } from "./index.ts";
import { matchesRegexp } from "./index.ts";
import {
approximateSemanticEquality,
capitalizedPrefix,
Expand All @@ -12,6 +15,7 @@ import {
} from "./index.ts";

import { assertEquals } from "https://deno.land/std@0.192.0/testing/asserts.ts";
import { regexpEntireString } from "./index.ts";

type Func = (...args: any[]) => any;

Expand Down Expand Up @@ -113,6 +117,10 @@ testUnaryFn(
"cleanSpeakers",
cleanSpeakers,
)([
[
"Mr. Darcy: You must know... surely, you must know it was all for you. You are too generous to trifle with me. I believe you spoke with my aunt last night, and it has taught me to hope as I'd scarcely allowed myself before. If your feelings are still what they were last April, tell me so at once. My affections and wishes have not changed, but one word from you will silence me forever. If, however, your feelings have changed, I will have to tell you: you have bewitched me, body and soul, and I love--I love--I love you. I never wish to be parted from you from this day on.",
"You must know... surely, you must know it was all for you. You are too generous to trifle with me. I believe you spoke with my aunt last night, and it has taught me to hope as I'd scarcely allowed myself before. If your feelings are still what they were last April, tell me so at once. My affections and wishes have not changed, but one word from you will silence me forever. If, however, your feelings have changed, I will have to tell you: you have bewitched me, body and soul, and I love--I love--I love you. I never wish to be parted from you from this day on.",
],
[
"Mr. Collins : Charlotte, come here. Charlotte Lucas : Has the pig escaped again? Charlotte Lucas : Oh. It's Lady Catherine.",
"Charlotte, come here. Has the pig escaped again? Oh. It's Lady Catherine.",
Expand All @@ -131,11 +139,11 @@ testUnaryFn(
],
[
'Jake Sully: Neytiri calls me skxawng. It means "moron."',
"Neytiri calls me skxawng. It means moron.",
'Neytiri calls me skxawng. It means "moron."',
],
[
'"Jesus! Did I SAY that? Or just think it? Was I talking? Did they hear me? I glanced over at my attorney, but he seemed oblivious…"― Hunter S. Thompson',
"Jesus! Did I SAY that? Or just think it? Was I talking? Did they hear me? I glanced over at my attorney, but he seemed oblivious…",
'"Jesus! Did I SAY that? Or just think it? Was I talking? Did they hear me? I glanced over at my attorney, but he seemed oblivious…"',
],
]);

Expand All @@ -157,3 +165,11 @@ testUnaryFn(
]);

testUnaryFn("simplify", simplify)([["M*A*S*H", "mash"]]);

// Exercises regexpTimes(1, 3, /a/) anchored to the whole string via
// regexpEntireString: "aa" is expected to match and "aaaa" is not.
// Presumably each pair is [input, expectedResult] — confirm against
// testUnaryFn, which is defined outside this hunk.
testUnaryFn(
"times",
matchesRegexp(regexpEntireString(regexpTimes(1, 3, /a/))),
)([
["aa", true],
["aaaa", false],
]);
30 changes: 16 additions & 14 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import {
join,
letIn,
lowercase,
map,
max,
nonempty,
pipe,
Expand All @@ -17,6 +16,7 @@ import {
trimWhitespace,
} from "https://deno.land/x/gamla@43.0.0/src/index.ts";

import { remove } from "https://deno.land/x/gamla@43.0.0/src/filter.ts";
import { fuzzySearch as fs } from "npm:levenshtein-search";
import { englishWords } from "./englishWords.ts";
import { stopWords } from "./stopWords.ts";
Expand Down Expand Up @@ -241,6 +241,9 @@ export const quotedTexts = (input: string): string[] => {
/**
 * Builds a new regular expression whose pattern is the source of `x`
 * immediately followed by the source of `y`. The flags of the result are
 * whatever `combineFlags` returns for the pair.
 */
export const concatRegexp = (x: RegExp, y: RegExp) => {
  const joinedSource = `${x.source}${y.source}`;
  const joinedFlags = combineFlags(x, y);
  return new RegExp(joinedSource, joinedFlags);
};

/**
 * Anchors `x` so that it only matches when the pattern spans the whole input.
 *
 * The source is wrapped in a non-capturing group before the anchors are
 * added. Without the group, a top-level alternation such as `a|b` would turn
 * into `^a|b$`, where each anchor binds to only one branch and "ab" would be
 * accepted. A non-capturing group is used so capture-group numbering of `x`
 * is left untouched.
 *
 * @param x Pattern to anchor; its flags are carried over unchanged.
 * @returns A new RegExp equivalent to `^(?:x)$`.
 */
export const regexpEntireString = (x: RegExp) =>
  new RegExp(`^(?:${x.source})$`, x.flags);

const combineFlags = (x: RegExp, y: RegExp) =>
(x.flags + y.flags)
.split("")
Expand Down Expand Up @@ -286,44 +289,43 @@ export const oneOrMore = (x: RegExp) =>

export const globalize = addFlag("g");

/**
 * Repeats the pattern `x` by appending a `{min,max}` quantifier to its
 * source, after the source has been passed through `bracketIfNeeded`.
 * The flags of `x` are kept on the result.
 *
 * @param min Lower bound written into the quantifier.
 * @param max Upper bound written into the quantifier.
 * @param x Pattern to repeat.
 */
export const regexpTimes = (min: number, max: number, x: RegExp) => {
  const operand = bracketIfNeeded(x.source);
  const quantifier = `{${min},${max}}`;
  return new RegExp(`${operand}${quantifier}`, x.flags);
};

// Honorific abbreviations (with their trailing dot) that may precede a
// speaker name. Each literal is passed through caseInsensitive and the
// results are folded together with regExpOr.
const speakerTitlePatterns = [/ms\./, /mrs\./, /mr\./, /dr\./];
const speakerTitle = speakerTitlePatterns.map(caseInsensitive).reduce(regExpOr);

const personName = [
optional(concatRegexp(speakerTitle, /\s/)),
zeroOrMore(/'?[A-Z][\w-]*\.?'?\s/),
regexpTimes(0, 2, /'?[A-Z][\w-]*\.?'?\s/),
/[\w-]+/,
].reduce(concatRegexp);

// Matches a single ASCII hyphen-minus character.
// NOTE(review): typographic dashes (e.g. "―", "–") are not in this class —
// confirm whether inputs that use them should be covered as well.
const hyphen = /[-]/;

// Hand-rolled token boundary: one delimiter character from the class below,
// or the start/end-of-input anchors. Used instead of \b because \b doesn't
// work for non-ASCII text. The three alternatives are folded with regExpOr
// (defined outside this hunk; presumably builds an alternation — confirm).
const boundry = [/[@.-\s:/בלה[\]?&%$#=*,!()]/, /^/, /$/].reduce(regExpOr);

const speaker = globalize(
[boundry, optional(hyphen), personName, /\s?:/, boundry].reduce(concatRegexp),
const speaker = [optional(hyphen), personName, /\s?:/, boundry].reduce(
concatRegexp,
);

// Trailing attribution pattern: a hyphen, optional whitespace, a person name,
// then end of input. The parts are joined in order with concatRegexp.
const speakerInEnd = [hyphen, /\s*/, personName, /$/].reduce(concatRegexp);

/**
 * Wraps the source of `x` in a negative lookbehind assertion, `(?<!...)`,
 * and returns it as a new RegExp carrying the same flags as `x`.
 */
export const negativeLookBehind = (x: RegExp) => {
  const assertion = "(?<!" + x.source + ")";
  return new RegExp(assertion, x.flags);
};

const splitSentences = split(/(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?])\s/);
const splitSentences = split(/(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[,.?:])\s/);

/**
 * Turns a regular expression into a reusable string predicate.
 *
 * `RegExp.prototype.test` is stateful for regexes carrying the `g` or `y`
 * flag: it starts at `lastIndex` and advances it on success, so calling the
 * same predicate twice could give different answers for the same text. To
 * keep the predicate pure, the test runs on a private copy of `r` whose
 * `lastIndex` is reset before every call; the caller's regex is never mutated.
 *
 * @param r Pattern to test with.
 * @returns A function reporting whether `txt` contains a match for `r`.
 */
export const matchesRegexp = (r: RegExp) => {
  const probe = new RegExp(r.source, r.flags);
  return (txt: string) => {
    // Always search from the start, regardless of earlier calls.
    probe.lastIndex = 0;
    return probe.test(txt);
  };
};

export const cleanSpeakers = pipe(
splitSentences,
map(
pipe(
split(speaker),
join(" "),
replace(/\s+/g, " "),
replace(/"/g, ""),
trimWhitespace,
),
),
remove(pipe(trimWhitespace, matchesRegexp(regexpEntireString(speaker)))),
join(" "),
replace(/\s+/g, " "),
replace(speakerInEnd, ""),
trimWhitespace,
);

export const ngramsOfAtLeastNWords = (n: number) => (s: string) => {
Expand Down

0 comments on commit 2897d23

Please sign in to comment.