Skip to content

Commit edc75a6

Browse files
committed
fix: search for empty string (#10985)
This PR ensures that searching for an empty string yields the expected alternating pattern: a zero-length match at every position, with a one-character rejection between consecutive matches. In particular, splitting by an empty string returns an array consisting of an empty string, then each of the string's characters as a singleton string, then another empty string (for example, splitting `"abc"` by `""` yields `["", "a", "b", "c", ""]`). This matches, for example, the [Rust behavior](https://doc.rust-lang.org/std/primitive.str.html#method.split).
1 parent 744f980 commit edc75a6

File tree

3 files changed

+60
-37
lines changed

3 files changed

+60
-37
lines changed

src/Init/Data/String/Pattern/Basic.lean

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ inductive SearchStep (s : Slice) where
3838
The subslice starting at {name}`startPos` and ending at {name}`endPos` did not match the pattern.
3939
-/
4040
| matched (startPos endPos : s.Pos)
41-
deriving Inhabited
41+
deriving Inhabited, BEq
4242

4343
/--
4444
Provides a conversion from a pattern to an iterator of {name}`SearchStep` that searches for matches

src/Init/Data/String/Pattern/String.lean

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ public section
2222
namespace String.Slice.Pattern
2323

2424
inductive ForwardSliceSearcher (s : Slice) where
25-
| empty (pos : s.Pos)
25+
| emptyBefore (pos : s.Pos)
26+
| emptyAt (pos : s.Pos) (h : pos ≠ s.endPos)
2627
| proper (needle : Slice) (table : Array String.Pos.Raw) (stackPos : String.Pos.Raw) (needlePos : String.Pos.Raw)
2728
| atEnd
2829
deriving Inhabited
@@ -56,7 +57,7 @@ where
5657
@[inline]
5758
def iter (s : Slice) (pat : Slice) : Std.Iter (α := ForwardSliceSearcher s) (SearchStep s) :=
5859
if pat.utf8ByteSize == 0 then
59-
{ internalState := .empty s.startPos }
60+
{ internalState := .emptyBefore s.startPos }
6061
else
6162
{ internalState := .proper pat (buildTable pat) s.startPos.offset pat.startPos.offset }
6263

@@ -71,9 +72,8 @@ instance (s : Slice) : Std.Iterators.Iterator (ForwardSliceSearcher s) Id (Searc
7172
IsPlausibleStep it
7273
| .yield it' out =>
7374
match it.internalState with
74-
| .empty pos =>
75-
(∃ newPos, pos < newPos ∧ it'.internalState = .empty newPos) ∨
76-
it'.internalState = .atEnd
75+
| .emptyBefore pos => (∃ h, it'.internalState = .emptyAt pos h) ∨ it'.internalState = .atEnd
76+
| .emptyAt pos h => ∃ newPos, pos < newPos ∧ it'.internalState = .emptyBefore newPos
7777
| .proper needle table stackPos needlePos =>
7878
(∃ newStackPos newNeedlePos,
7979
stackPos < newStackPos ∧
@@ -85,12 +85,15 @@ instance (s : Slice) : Std.Iterators.Iterator (ForwardSliceSearcher s) Id (Searc
8585
| .done => True
8686
step := fun ⟨iter⟩ =>
8787
match iter with
88-
| .empty pos =>
88+
| .emptyBefore pos =>
8989
let res := .matched pos pos
9090
if h : pos ≠ s.endPos then
91-
pure (.deflate ⟨.yield ⟨.empty (pos.next h)⟩ res, by simp⟩)
91+
pure (.deflate ⟨.yield ⟨.emptyAt pos h⟩ res, by simp [h]⟩)
9292
else
9393
pure (.deflate ⟨.yield ⟨.atEnd⟩ res, by simp⟩)
94+
| .emptyAt pos h =>
95+
let res := .rejected pos (pos.next h)
96+
pure (.deflate ⟨.yield ⟨.emptyBefore (pos.next h)⟩ res, by simp⟩)
9497
| .proper needle table stackPos needlePos =>
9598
let rec findNext (startPos : String.Pos.Raw)
9699
(currStackPos : String.Pos.Raw) (needlePos : String.Pos.Raw) (h : stackPos ≤ currStackPos) :=
@@ -148,15 +151,17 @@ instance (s : Slice) : Std.Iterators.Iterator (ForwardSliceSearcher s) Id (Searc
148151
findNext stackPos stackPos needlePos (by simp)
149152
| .atEnd => pure (.deflate ⟨.done, by simp⟩)
150153

151-
private def toPair : ForwardSliceSearcher s → (Nat × Nat)
152-
| .empty pos => (1, s.utf8ByteSize - pos.offset.byteIdx)
153-
| .proper _ _ sp _ => (1, s.utf8ByteSize - sp.byteIdx)
154-
| .atEnd => (0, 0)
154+
private def toOption : ForwardSliceSearcher s → Option (Nat × Nat)
155+
| .emptyBefore pos => some (s.utf8ByteSize - pos.offset.byteIdx, 1)
156+
| .emptyAt pos _ => some (s.utf8ByteSize - pos.offset.byteIdx, 0)
157+
| .proper _ _ sp _ => some (s.utf8ByteSize - sp.byteIdx, 0)
158+
| .atEnd => none
155159

156160
private instance : WellFoundedRelation (ForwardSliceSearcher s) where
157-
rel s1 s2 := Prod.Lex (· < ·) (· < ·) s1.toPair s2.toPair
161+
rel := InvImage (Option.lt (Prod.Lex (· < ·) (· < ·))) ForwardSliceSearcher.toOption
158162
wf := by
159163
apply InvImage.wf
164+
apply Option.wellFounded_lt
160165
apply (Prod.lex _ _).wf
161166

162167
private def finitenessRelation :
@@ -168,30 +173,26 @@ private def finitenessRelation :
168173
obtain ⟨step, h, h'⟩ := h
169174
cases step
170175
· cases h
171-
simp only [Std.Iterators.IterM.IsPlausibleStep, Std.Iterators.Iterator.IsPlausibleStep] at h'
172-
split at h'
173-
· next heq =>
174-
rw [heq]
175-
rcases h' with ⟨np, h1', h2'⟩ | h'
176-
· rw [h2']
177-
apply Prod.Lex.right'
178-
· simp
179-
· have haux := np.isValidForSlice.le_utf8ByteSize
180-
simp [Slice.Pos.lt_iff, String.Pos.Raw.le_iff, String.Pos.Raw.lt_iff] at h1' haux ⊢
181-
omega
182-
· apply Prod.Lex.left
183-
simp [h']
184-
· next heq =>
185-
rw [heq]
186-
rcases h' with ⟨np, sp, h1', h2', h3'⟩ | h'
187-
· rw [h3']
188-
apply Prod.Lex.right'
189-
· simp
190-
· simp [String.Pos.Raw.le_iff, String.Pos.Raw.lt_iff] at h1' h2' ⊢
191-
omega
192-
· apply Prod.Lex.left
193-
simp [h']
194-
· contradiction
176+
revert h'
177+
simp only [Std.Iterators.IterM.IsPlausibleStep, Std.Iterators.Iterator.IsPlausibleStep]
178+
match it.internalState with
179+
| .emptyBefore pos =>
180+
rintro (⟨h, h'⟩|h') <;> simp [h', ForwardSliceSearcher.toOption, Option.lt, Prod.lex_def]
181+
| .emptyAt pos h =>
182+
simp only [forall_exists_index, and_imp]
183+
intro x hx h
184+
have := x.isValidForSlice.le_utf8ByteSize
185+
simp [h, ForwardSliceSearcher.toOption, Option.lt, Prod.lex_def, Pos.lt_iff,
186+
Pos.Raw.lt_iff, Pos.Raw.le_iff] at hx ⊢ this
187+
omega
188+
| .proper needle table stackPos needlePos =>
189+
simp only [exists_and_left]
190+
rintro (⟨newStackPos, h₁, h₂, ⟨x, hx⟩⟩|h)
191+
· simp [hx, ForwardSliceSearcher.toOption, Option.lt, Prod.lex_def, Pos.Raw.lt_iff,
192+
Pos.Raw.le_iff] at ⊢ h₁ h₂
193+
omega
194+
· simp [h, ForwardSliceSearcher.toOption, Option.lt]
195+
| .atEnd .. => simp
195196
· cases h'
196197
· cases h
197198

tests/lean/run/string_slice.lean

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,3 +209,25 @@ Tests for `String.Slice` functions
209209

210210
#guard "abc".toSlice.back = 'c'
211211
#guard "".toSlice.back = (default : Char)
212+
213+
section
214+
open String.Slice.Pattern
215+
216+
instance [Monad n]{s : String.Slice} : Std.Iterators.IteratorCollect (ForwardSliceSearcher s) Id n :=
217+
.defaultImplementation
218+
219+
#guard (ToForwardSearcher.toSearcher "".toSlice "").toList == [.matched "".toSlice.startPos "".toSlice.startPos]
220+
#guard (ToForwardSearcher.toSearcher "abc".toSlice "").toList == [
221+
.matched ("abc".toSlice.pos ⟨0⟩ (by decide)) ("abc".toSlice.pos ⟨0⟩ (by decide)),
222+
.rejected ("abc".toSlice.pos ⟨0⟩ (by decide)) ("abc".toSlice.pos ⟨1⟩ (by decide)),
223+
.matched ("abc".toSlice.pos ⟨1⟩ (by decide)) ("abc".toSlice.pos ⟨1⟩ (by decide)),
224+
.rejected ("abc".toSlice.pos ⟨1⟩ (by decide)) ("abc".toSlice.pos ⟨2⟩ (by decide)),
225+
.matched ("abc".toSlice.pos ⟨2⟩ (by decide)) ("abc".toSlice.pos ⟨2⟩ (by decide)),
226+
.rejected ("abc".toSlice.pos ⟨2⟩ (by decide)) ("abc".toSlice.pos ⟨3⟩ (by decide)),
227+
.matched ("abc".toSlice.pos ⟨3⟩ (by decide)) ("abc".toSlice.pos ⟨3⟩ (by decide)),
228+
]
229+
230+
end
231+
232+
#guard ("".toSlice.split "").allowNontermination.toList == ["".toSlice, "".toSlice]
233+
#guard ("abc".toSlice.split "").allowNontermination.toList == ["".toSlice, "a".toSlice, "b".toSlice, "c".toSlice, "".toSlice]

0 commit comments

Comments
 (0)