-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #644 from jvalue/perf-text-file
[PERF] Don't split and join lines when using `TextFile`
- Loading branch information
Showing
7 changed files
with
342 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,3 +4,4 @@ | |
|
||
export * from './implements-static-decorator'; | ||
export * from './file-util'; | ||
export * from './string-util'; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
// SPDX-FileCopyrightText: 2025 Friedrich-Alexander-Universitat Erlangen-Nurnberg | ||
// | ||
// SPDX-License-Identifier: AGPL-3.0-only | ||
|
||
// eslint-disable-next-line unicorn/prefer-node-protocol | ||
import { AssertionError } from 'assert'; | ||
|
||
import { either } from 'fp-ts'; | ||
|
||
import { ensureGlobal, findLineBounds } from './string-util'; | ||
|
||
// Unit tests for the string-util helpers `ensureGlobal` and `findLineBounds`. | ||
// NOTE(review): `assert` is called below but only `AssertionError` is imported in the lines shown; presumably a test-runner global, confirm. | ||
describe('Validation of string-util', () => { | ||
describe('Function ensureGlobal', () => { | ||
// A pattern without the `g` flag must come back global with an unchanged source. | ||
it('should make a non global RegExp global', () => { | ||
const result = ensureGlobal(/someregex/); | ||
|
||
expect(result.global).toBe(true); | ||
expect(result.source).toBe('someregex'); | ||
}); | ||
// A pattern that already has the `g` flag must stay global with an unchanged source. | ||
it('should keep a global RegExp global', () => { | ||
const result = ensureGlobal(/someregex/g); | ||
|
||
expect(result.global).toBe(true); | ||
expect(result.source).toBe('someregex'); | ||
}); | ||
}); | ||
describe('Function findLineBounds', () => { | ||
// No requested indices: the result is a Right holding an empty list. | ||
it('should return empty array for empty array', () => { | ||
const result = findLineBounds([], /\r?\n/, 'some text'); | ||
|
||
expect(either.isRight(result)).toBe(true); | ||
assert(either.isRight(result)); | ||
expect(result.right).toStrictEqual([]); | ||
}); | ||
// Only line 0 is found: the Left carries the first missing index (30) and the bounds collected so far. | ||
it('should return first non existent lineIdx', () => { | ||
const result = findLineBounds( | ||
[0, 30, 300], | ||
/\r?\n/, | ||
`some text | ||
`, | ||
); | ||
|
||
expect(either.isLeft(result)).toBe(true); | ||
assert(either.isLeft(result)); | ||
expect(result.left.firstNonExistentLineIdx).toBe(30); | ||
// Length 10 = the 9 characters of "some text" plus the line break, which is counted in the bounds. | ||
expect(result.left.existingBounds).toStrictEqual([ | ||
{ start: 0, length: 10 }, | ||
]); | ||
}); | ||
// Without any line break the whole text is line 0; index 1 is reported as missing. | ||
it('should return the entire string if there is no newline', () => { | ||
const result = findLineBounds( | ||
[0, 1], | ||
/\r?\n/, | ||
'some text without a newline', | ||
); | ||
|
||
expect(either.isLeft(result)).toBe(true); | ||
assert(either.isLeft(result)); | ||
expect(result.left.firstNonExistentLineIdx).toBe(1); | ||
// 27 is the full length of the input string. | ||
expect(result.left.existingBounds).toStrictEqual([ | ||
{ start: 0, length: 27 }, | ||
]); | ||
}); | ||
// Consecutive indices yield adjacent bounds; index 3 is expected to be reported as missing. | ||
it('should correctly map multiple indices', () => { | ||
const result = findLineBounds( | ||
[0, 1, 2, 3], | ||
/\r?\n/, | ||
`some | ||
text with | ||
newlines | ||
`, | ||
); | ||
|
||
expect(either.isLeft(result)).toBe(true); | ||
assert(either.isLeft(result)); | ||
expect(result.left.firstNonExistentLineIdx).toBe(3); | ||
// NOTE(review): the second line as shown is 9 characters plus a line break, yet length 11 is expected; whitespace inside the literal may have been lost in this view, verify against the real file. | ||
expect(result.left.existingBounds).toStrictEqual([ | ||
{ start: 0, length: 5 }, | ||
{ start: 5, length: 11 }, | ||
{ start: 16, length: 9 }, | ||
]); | ||
}); | ||
// Indices must be passed in ascending order; [1, 0] is expected to raise an AssertionError. | ||
it('should throw an error on out of order indices', () => { | ||
expect(() => findLineBounds([1, 0], /\r?\n/, '')).toThrowError( | ||
AssertionError, | ||
); | ||
}); | ||
}); | ||
}); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
// SPDX-FileCopyrightText: 2025 Friedrich-Alexander-Universitat Erlangen-Nurnberg | ||
// | ||
// SPDX-License-Identifier: AGPL-3.0-only | ||
|
||
// eslint-disable-next-line unicorn/prefer-node-protocol | ||
import assert from 'assert'; | ||
|
||
import { either } from 'fp-ts'; | ||
|
||
/**
 * Return a regular expression that carries the global (`g`) flag.
 *
 * A pattern that is already global is handed back unchanged (the very same
 * object). Otherwise a new RegExp is built from the same source, with `g`
 * appended to the existing flags.
 *
 * @param regex the pattern to make global.
 * @returns `regex` itself if it is global, else a global copy of it.
 */
export function ensureGlobal(regex: RegExp): RegExp {
  return regex.global ? regex : new RegExp(regex.source, `${regex.flags}g`);
}
|
||
/**
 * Check that every element is greater than or equal to its predecessor.
 *
 * Equal neighbours are accepted; an empty or single-element array is sorted.
 *
 * @param numbers the values to inspect.
 * @returns `true` if no element is followed by a smaller one.
 */
function isSortedAscending(numbers: number[]): boolean {
  const hasDescendingStep = numbers.some((current, position) => {
    // The first element has no predecessor to compare against.
    if (position === 0) {
      return false;
    }
    const previous = numbers[position - 1];
    assert(previous !== undefined);
    return !(previous <= current);
  });
  return !hasDescendingStep;
}
|
||
/**
 * Locate a single line inside `text`.
 *
 * Lines are delimited by matches of `lineBreakPattern`. The reported bounds
 * cover the line's content plus its terminating line break (all characters of
 * it, e.g. both of `\r\n`), so the bounds of consecutive lines are adjacent.
 *
 * @param searchIdx zero-based index of the line to locate, relative to the start of `text`.
 * @param lineBreakPattern the pattern that marks a line break; it is made global before matching.
 * @param text the text to search.
 * @returns the start offset and length of the line, or `undefined` if `text`
 *   has no line with that index. The segment after the last line break counts
 *   as a line here, even when it is empty.
 * @throws AssertionError if a line break is encountered while `searchIdx` is negative.
 */
function findSingleLineBounds(
  searchIdx: number,
  lineBreakPattern: RegExp,
  text: string,
): { start: number; length: number } | undefined {
  let currentLineIdx = 0;
  let currentLineStart = 0;

  for (const lineBreak of text.matchAll(ensureGlobal(lineBreakPattern))) {
    assert(currentLineIdx <= searchIdx);

    // `index` is always set on `matchAll` results; the fallback only keeps
    // older library typings (where it is optional) happy.
    const lineBreakStart = lineBreak.index ?? 0;
    // Exclusive end of the current line, including the complete line break.
    const lineEnd = lineBreakStart + lineBreak[0].length;

    if (currentLineIdx === searchIdx) {
      return {
        start: currentLineStart,
        length: lineEnd - currentLineStart,
      };
    }

    // Match offsets are absolute within `text`, so the next line starts
    // exactly where this one ends (assign, do not accumulate).
    currentLineIdx += 1;
    currentLineStart = lineEnd;
  }

  // HINT: Line with idx `searchIdx` not found.
  if (currentLineIdx !== searchIdx) {
    return undefined;
  }
  // The requested line is the final segment, which has no trailing line break.
  return {
    start: currentLineStart,
    length: text.length - currentLineStart,
  };
}
|
||
/** Start offset and length of each located line, in the order the lines were requested. */
type Bounds = { start: number; length: number }[];

/**
 * Map line idxs to line bounds.
 *
 * The bounds of a line include its terminating line break. The empty segment
 * after a final line break is not considered a line, no matter which other
 * lines were requested alongside it; only line 0 of an empty text may have
 * length zero.
 *
 * @param lineIdxs the indices of the lines to find bounds for. MUST be sorted in ASCENDING order
 *   and must not contain duplicates.
 * @param lineBreakPattern the pattern that marks a new line.
 * @param text the text containing newlines.
 * @returns a `Right` with the bounds of every requested line (offsets relative to the
 *   original `text`), or a `Left` with the first requested index that does not exist
 *   together with the bounds found before it.
 * @throws AssertionError if `lineIdxs` is not sorted ascending, or if an index is
 *   repeated after it has already been resolved.
 *
 * @example
 * const result = findLineBounds([0, 300], /\r?\n/, 'some\ntext\n');
 * assert(either.isLeft(result));
 * assert(result.left.firstNonExistentLineIdx === 300);
 * // result.left.existingBounds is [{ start: 0, length: 5 }]
 */
export function findLineBounds(
  lineIdxs: number[],
  lineBreakPattern: RegExp,
  text: string,
): either.Either<
  { existingBounds: Bounds; firstNonExistentLineIdx: number },
  Bounds
> {
  assert(isSortedAscending(lineIdxs));
  // Index of the first line that is still contained in `remainingText`.
  let lineIdxOffset = 0;
  // Offset of `remainingText` within the original `text`.
  let charIdxOffset = 0;
  // Work on a local copy instead of reassigning the parameter.
  let remainingText = text;

  const bounds: Bounds = [];

  for (const searchIdx of lineIdxs) {
    // Everything has been consumed already: no further line can exist.
    if (searchIdx > 0 && remainingText.length === 0) {
      return either.left({
        existingBounds: bounds,
        firstNonExistentLineIdx: searchIdx,
      });
    }
    assert(searchIdx >= lineIdxOffset);
    const found = findSingleLineBounds(
      searchIdx - lineIdxOffset,
      lineBreakPattern,
      remainingText,
    );
    // A zero-length hit past line 0 is the empty segment behind the final
    // line break; treat it as missing so the answer does not depend on
    // whether the preceding lines were requested too.
    if (found === undefined || (searchIdx > 0 && found.length === 0)) {
      return either.left({
        existingBounds: bounds,
        firstNonExistentLineIdx: searchIdx,
      });
    }

    const { start, length } = found;

    bounds.push({
      start: charIdxOffset + start,
      length,
    });

    // Skip the lines in front of the hit as well as the hit itself, so the
    // remaining text stays aligned with `charIdxOffset` and `lineIdxOffset`
    // even when the requested indices have gaps.
    const consumed = start + length;
    charIdxOffset += consumed;
    lineIdxOffset = searchIdx + 1;
    remainingText = remainingText.slice(consumed);
  }

  return either.right(bounds);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.