@@ -128,6 +128,103 @@ let is_in_string = ref false
128
128
(* Whether the lexer is currently inside a string literal; reads the
   [is_in_string] flag defined above. *)
let in_string = fun () -> !is_in_string

(* Controls whether lexer warnings (e.g. latin1 identifiers) are emitted. *)
let print_warnings = ref true
(* A token to be returned by the *next* call to the lexer, together with
   the positions the lexbuf window should be set to when it is returned. *)
type deferred_token =
  { token : token
  ; start_pos : Lexing.position
  ; end_pos : Lexing.position
  }

(* This queue will only ever have 0 or 1 elements in it. We use it
   instead of an [option ref] for its convenient interface.
*)
let deferred_tokens : deferred_token Queue.t = Queue.create ()
142
(* Effectively splits the text in the lexer's current "window" (defined below)
   into two halves. The current call to the lexer will return the first half of
   the text in the window, and the next call to the lexer will return the second
   half (of length [len]) of the text in the window.

   "window" refers to the text matched by a production of the lexer. It spans
   from [lexer.lex_start_p] to [lexer.lex_curr_p].

   The function accomplishes this splitting by doing two things:
    - It sets the current window of the lexbuf to only account for the
      first half of the text. (The first half is of length: |text|-len.)
    - It enqueues a token into [deferred_tokens] such that, the next time the
      lexer is called, it will return the specified [token] *and* set the window
      of the lexbuf to account for the second half of the text. (The second half
      is of length: |len|.)

   This business with setting the window of the lexbuf is only so that error
   messages point at the right place in the program text.
*)
let enqueue_token_from_end_of_lexbuf_window (lexbuf : Lexing.lexbuf) token ~len =
  let suffix_end = lexbuf.lex_curr_p in
  (* The deferred token starts [len] characters before the end of the
     current window. *)
  let suffix_start =
    { suffix_end with pos_cnum = suffix_end.pos_cnum - len }
  in
  (* Shrink the current window so this call reports only the first half;
     the mangled source read [< -] here, which is not the assignment
     operator — restored to [<-]. *)
  lexbuf.lex_curr_p <- suffix_start;
  Queue.add
    { token; start_pos = suffix_start; end_pos = suffix_end }
    deferred_tokens
171
(* Note [Lexing hack for float#]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   This note describes a non-backward-compatible Jane Street--internal change to
   the lexer.

   We want the lexer to lex [float#] differently than [float #]. [float#] is the
   new syntax for the unboxed float type. It veers close to the syntax for the
   type of all objects belonging to a class [c], which is [#c]. The way we
   navigate this veering is by producing the following tokens for these source
   program examples, where LIDENT(s) is an LIDENT with text [s].

   float#c ==> LIDENT(float) HASH_SUFFIX LIDENT(c)
   float# c ==> LIDENT(float) HASH_SUFFIX LIDENT(c)
   float # c ==> LIDENT(float) HASH LIDENT(c)
   float #c ==> LIDENT(float) HASH LIDENT(c)

   (A) The parser interprets [LIDENT(float) HASH_SUFFIX LIDENT(c)] as
       "the type constructor [c] applied to the unboxed float type."
   (B) The parser interprets [LIDENT(float) HASH LIDENT(c)] as
       "the type constructor [#c] applied to the usual boxed float type."

   This is not a backward-compatible change. In upstream ocaml, the lexer
   produces [LIDENT(float) HASH LIDENT(c)] for all the above source programs.

   But, this isn't problematic: everybody puts a space before '#c' to mean (B).
   No existing code writes things like [float#c] or indeed [float# c].

   We accomplish this hack by setting some global mutable state upon seeing
   an identifier immediately followed by a hash. When that state is set, we
   will produce [HASH_SUFFIX] the next time the lexer is called. This is
   done in [enqueue_hash_suffix_from_end_of_lexbuf_window].

   Note [Lexing hack for hash operators]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   To complicate the above story, we don't want to treat the # in the
   below program as HASH_SUFFIX:

   x#~#y

   We instead want:

   x#~#y ==> LIDENT(x) HASHOP(#~#) LIDENT(y)

   This is to allow for infix hash operators. We add an additional hack, in
   the style of Note [Lexing hack for float#], where the lexer consumes [x#~#]
   all at once, but produces LIDENT(x) from the current call to the lexer and
   HASHOP(#~#) from the next call to the lexer. This is done in
   [enqueue_hashop_from_end_of_lexbuf_window].
*)

(* Defer a [HASH_SUFFIX] token covering the final '#' (length 1) of the
   current lexbuf window. See Note [Lexing hack for float#]. *)
let enqueue_hash_suffix_from_end_of_lexbuf_window lexbuf =
  enqueue_token_from_end_of_lexbuf_window lexbuf HASH_SUFFIX ~len:1

(* Defer a [HASHOP hashop] token covering the trailing operator text of the
   current lexbuf window. See Note [Lexing hack for hash operators]. *)
let enqueue_hashop_from_end_of_lexbuf_window lexbuf ~hashop =
  enqueue_token_from_end_of_lexbuf_window lexbuf (HASHOP hashop)
    ~len:(String.length hashop)
131
228
(* Escaped chars are interpreted in strings unless they are in comments. *)
(* In a comment the raw lexeme is preserved verbatim; in a string the
   already-decoded character [c] is stored instead. *)
let store_escaped_char lexbuf c =
  if in_comment () then store_lexeme lexbuf else store_string_char c
@@ -419,8 +516,33 @@ rule token = parse
419
516
| " ?" (lowercase_latin1 identchar_latin1 * as name) ':'
420
517
{ warn_latin1 lexbuf;
421
518
OPTLABEL name }
519
+ (* Lowercase identifiers are split into 3 cases, and the order matters
520
+ (longest to shortest).
521
+ *)
522
+ | (lowercase identchar * as name) ('#' symbolchar_or_hash+ as hashop)
523
+ (* See Note [Lexing hack for hash operators] *)
524
+ { enqueue_hashop_from_end_of_lexbuf_window lexbuf ~hashop ;
525
+ lookup_keyword name }
526
+ | (lowercase identchar * as name) '#'
527
+ (* See Note [Lexing hack for float#] *)
528
+ { enqueue_hash_suffix_from_end_of_lexbuf_window lexbuf;
529
+ lookup_keyword name }
422
530
| lowercase identchar * as name
423
531
{ lookup_keyword name }
532
+ (* Lowercase latin1 identifiers are split into 3 cases, and the order matters
533
+ (longest to shortest).
534
+ *)
535
+ | (lowercase_latin1 identchar_latin1 * as name)
536
+ ('#' symbolchar_or_hash+ as hashop)
537
+ (* See Note [Lexing hack for hash operators] *)
538
+ { warn_latin1 lexbuf;
539
+ enqueue_hashop_from_end_of_lexbuf_window lexbuf ~hashop ;
540
+ LIDENT name }
541
+ | (lowercase_latin1 identchar_latin1 * as name) '#'
542
+ (* See Note [Lexing hack for float#] *)
543
+ { warn_latin1 lexbuf;
544
+ enqueue_hash_suffix_from_end_of_lexbuf_window lexbuf;
545
+ LIDENT name }
424
546
| lowercase_latin1 identchar_latin1 * as name
425
547
{ warn_latin1 lexbuf; LIDENT name }
426
548
| uppercase identchar * as name
@@ -775,6 +897,13 @@ and skip_hash_bang = parse
775
897
| "" { () }
776
898
777
899
{
  (* Wraps — and deliberately shadows — the lexer-generated [token] entry
     point (the unqualified [token] in the [None] branch refers to the
     generated rule, since this [let] is not [rec]). If a token was deferred
     by [enqueue_token_from_end_of_lexbuf_window], return it now and set the
     lexbuf window to the positions recorded for it, so error messages point
     at the right program text; otherwise run the generated rule. *)
  let token lexbuf =
    match Queue.take_opt deferred_tokens with
    | None -> token lexbuf
    | Some { token; start_pos; end_pos } ->
      lexbuf.lex_start_p <- start_pos;
      lexbuf.lex_curr_p <- end_pos;
      token
778
907
779
908
let token_with_comments lexbuf =
780
909
match !preprocessor with
0 commit comments