diff --git a/lib/makeup/lexers/erlang_lexer.ex b/lib/makeup/lexers/erlang_lexer.ex index eb94a3f..4a05f97 100644 --- a/lib/makeup/lexers/erlang_lexer.ex +++ b/lib/makeup/lexers/erlang_lexer.ex @@ -59,24 +59,29 @@ defmodule Makeup.Lexers.ErlangLexer do ]) # Numbers - digits = ascii_string([?0..?9], min: 1) + # + # Erlang/OTP 27 added underscore separators in numeric literals + # (`1_000_000`, `16#FF_FF`, `0.1_5e1_0`). Lexer-tolerant: underscores are + # accepted anywhere inside the digit run; we don't validate position. + digits = ascii_string([?0..?9, ?_], min: 1) number_integer = optional(ascii_char([?+, ?-])) - |> concat(digits) + |> ascii_char([?0..?9]) + |> optional(ascii_string([?0..?9, ?_], min: 1)) |> token(:number_integer) number_integer_in_weird_base = optional(ascii_char([?+, ?-])) |> concat(numeric_base) |> string("#") - |> ascii_string([?0..?9, ?a..?z, ?A..?Z], min: 1) + |> ascii_string([?0..?9, ?a..?z, ?A..?Z, ?_], min: 1) |> token(:number_integer) # Floating point numbers float_scientific_notation_part = ascii_string([?e, ?E], 1) - |> optional(string("-")) + |> optional(ascii_char([?+, ?-])) |> concat(digits) number_float = @@ -91,6 +96,16 @@ defmodule Makeup.Lexers.ErlangLexer do ascii_string([?A..?Z, ?_], 1) |> optional(ascii_string([?a..?z, ?_, ?0..?9, ?A..?Z], min: 1)) + # An underscore followed by at least one identifier character (`_5`, + # `_X`, `_unused`). Bare `_` stays as a punctuation token (the wildcard + # pattern), but an underscore-prefixed name is a variable in Erlang grammar and should + # render as `:name`. Without this rule the `_` is matched first by + # the `punctuation` rule and the rest of the identifier falls through. 
+ underscore_identifier = + string("_") + |> ascii_string([?a..?z, ?_, ?0..?9, ?A..?Z], min: 1) + |> token(:name) + simple_atom_name = ascii_string([?a..?z], 1) |> optional(ascii_string([?a..?z, ?_, ?@, ?0..?9, ?A..?Z], min: 1)) @@ -141,6 +156,20 @@ defmodule Makeup.Lexers.ErlangLexer do macro_name = choice([variable_name, atom_name]) + # Parameterised macro reference: `?FOO(arg1, arg2)`. Tokenised + # separately from the parameterless form so themes can render the two + # distinctly (matches `makeup_elixir`'s split between `@foo` and + # `@foo(...)`). The macro head emits as `:name_function`; the trailing + # `(` opens the standard punctuation group so paren matching still + # works. + macro_call = + string("?") + |> concat(macro_name) + |> token(:name_function) + |> concat(optional(whitespace)) + |> concat(token("(", :punctuation)) + + # Parameterless macro: `?FOO`. Constants by convention. macro = string("?") |> concat(macro_name) @@ -152,10 +181,23 @@ defmodule Makeup.Lexers.ErlangLexer do |> optional(string(".") |> concat(atom_name)) |> token(:name_label) + # `$\xFF`, `$\x{1F600}`, `$\077`, `$\^A`, plus simple `$\n` / `$\t` / `$\\` / + # `$\"` / `$\'` etc. The structured escapes (octal, hex, ctrl) must be tried + # before the single-char fallback so multi-character sequences are consumed + # whole. + character_escape = + string("\\") + |> choice([ + escape_hex, + escape_octal, + escape_ctrl, + utf8_char([]) + ]) + character = string("$") |> choice([ - string("\\") |> utf8_char([]), + character_escape, utf8_char(not: ?\\) ]) |> token(:string_char) @@ -166,18 +208,55 @@ defmodule Makeup.Lexers.ErlangLexer do |> ascii_char(to_charlist("~#+BPWXb-ginpswx")) |> token(:string_interpol) - escape_double_quote = string(~s/\\"/) - erlang_string = string_like(~s/"/, ~s/"/, [escape_double_quote, string_interpol], :string) - + # Sub-token emitted inside string literals for escape sequences. 
Mirrors + # the `character_escape` shape so multi-character escapes (`\xFF`, + # `\x{1F600}`, `\077`, `\^A`) are consumed whole instead of getting + # cut at the first byte. Themes can render these distinctly. escaped_char = string("\\") - |> utf8_string([], 1) + |> choice([ + escape_hex, + escape_octal, + escape_ctrl, + utf8_char([]) + ]) |> token(:string_escape) + erlang_string = string_like(~s/"/, ~s/"/, [escaped_char, string_interpol], :string) + + # Multi-quoted strings (OTP 27+). The opening run of `"""` (or more) on + # its own line opens the string; a matching run on its own line closes + # it. Use a quadruple/quintuple opener when the body needs to contain + # `"""` literally. Each variant is a separate rule because NimbleParsec + # doesn't support dynamic delimiter lengths; longer-quote variants must + # be tried first so the triple-quote rule doesn't claim them prematurely. + quintuple_quoted_string = + lookahead_string( + string(~s/"""""\n/), + string(~s/\n"""""/), + [escaped_char, string_interpol] + ) + + quadruple_quoted_string = + lookahead_string( + string(~s/""""\n/), + string(~s/\n""""/), + [escaped_char, string_interpol] + ) + triple_quoted_string = lookahead_string(string(~s/"""\n/), string(~s/\n"""/), [escaped_char, string_interpol]) + # Longer-quote variants must come first so the longest matching delimiter + # wins for sigils like `~"""""..."""""` (quintuple) or `~""""..."""" ` + # (quadruple) — these are needed when the sigil body has to contain + # `"""` or `""""` literally, mirroring the rule for plain multi-quoted + # strings above. sigil_delimiters = [ + {~s["""""\n], ~s[\n"""""]}, + {"'''''\n", "\n'''''"}, + {~s[""""\n], ~s[\n""""]}, + {"''''\n", "\n''''"}, {~s["""\n], ~s[\n"""]}, {"'''\n", "\n'''"}, {"\"", "\""}, @@ -220,13 +299,47 @@ defmodule Makeup.Lexers.ErlangLexer do syntax_operators = word_from_list( - ~W[+ - +? ++ = == -- * / < > /= =:= =/= =< >= ==? <- <:- <= <:= ! ? ?!], + ~W[+ - +? ++ = == -- * / < > /= =:= =/= =< >= ==? 
<- <:- <= <:= ! ? ?! ?=], :operator ) + # OTP 29 native records relax the record-name rule: per the spec + # (https://www.erlang.org/doc/system/data_types.html), "it is not necessary + # to quote atoms that look like variable names or keywords." So `#State{}`, + # `#div{}`, `#case{}` are all valid record references even though `State` + # is variable-shape and `div`/`case` are reserved words. Tuple-based records + # don't allow these forms, but the lexer can't tell the two record kinds + # apart from local context — so accept the union. + # + # The `record_name: true` meta marker tells postprocess to skip the + # keyword / builtin / word-operator conversion for this position. Without + # it, `#case{}` would tokenise as `[#, keyword case, {]` — visually + # confusing because `case` here names a record, not an expression keyword. + record_name = + choice([ + token(atom_name, :string_symbol, %{record_name: true}), + token(variable_name, :string_symbol, %{record_name: true}) + ]) + + # External native record construction / pattern / field access: + # #Module:Name{F = V} + # #Module:Name.field + # The `Module:Name` shape between `#` and `{` (or `.`) was added in OTP 29 + # alongside native records. Local construction (`#Name{...}`) is identical + # in shape to a tuple-based record and is handled by the rule below. + native_record_external = + token(string("#"), :operator) + |> concat(token(atom_name, :name_class)) + |> concat(token(":", :punctuation)) + |> concat(record_name) + |> choice([ + token("{", :punctuation), + token(".", :punctuation) + ]) + record = token(string("#"), :operator) - |> concat(atom) + |> concat(record_name) |> choice([ token("{", :punctuation), token(".", :punctuation) @@ -249,10 +362,19 @@ defmodule Makeup.Lexers.ErlangLexer do |> concat(token("/", :punctuation)) |> concat(number_integer) - # Erlang prompt + # Erlang prompt. Anchored to a line boundary by requiring the leading + # whitespace to contain at least one `\n`. 
The original rule required + # the `\n` immediately before the prompt body, which broke when the + # generic `whitespace` rule had already consumed the trailing `\n` of + # a multi-character whitespace block (see makeup_elixir #28). Allowing + # any leading non-newline whitespace before the `\n` and any further + # whitespace after lets the rule match in those cases without + # false-positiving on `1 > 2` or `x. 1> a.` (neither contains a `\n` + # in the relevant position). erl_prompt = - ascii_string([?\s, ?\r, ?\t], min: 0) + ascii_string([?\s, ?\f, ?\r, ?\t], min: 0) |> string("\n") + |> optional(ascii_string([?\s, ?\f, ?\r, ?\n, ?\t], min: 1)) |> token(:whitespace) |> concat( optional(string("(") |> concat(atom_name) |> string(")")) @@ -299,12 +421,21 @@ defmodule Makeup.Lexers.ErlangLexer do hashbang, whitespace, comment, + quintuple_quoted_string, + quadruple_quoted_string, triple_quoted_string, erlang_string ] ++ all_sigils ++ [ + native_record_external, record, + underscore_identifier, + # Macros must be tried before `syntax_operators`, since the + # operator list contains `?` and `?=` and would otherwise eat the + # leading `?` of `?FOO` / `?FOO(X)`. 
+ macro_call, + macro, punctuation, # `tuple` might be unnecessary tuple, @@ -319,7 +450,6 @@ defmodule Makeup.Lexers.ErlangLexer do function_arity, function, atom, - macro, character, label, # If we can't parse any of the above, we highlight the next character as an error @@ -353,38 +483,45 @@ defmodule Makeup.Lexers.ErlangLexer do @keywords ~W[after begin case catch cond end fun if let of query receive try when maybe else] - @builtins ~W[ - abs append_element apply atom_to_list binary_to_list bitstring_to_list - binary_to_term bit_size bump_reductions byte_size cancel_timer - check_process_code delete_module demonitor disconnect_node display - element erase exit float float_to_list fun_info fun_to_list - function_exported garbage_collect get get_keys group_leader hash - hd integer_to_list iolist_to_binary iolist_size is_atom is_binary - is_bitstring is_boolean is_builtin is_float is_function is_integer - is_list is_number is_pid is_port is_process_alive is_record is_reference - is_tuple length link list_to_atom list_to_binary list_to_bitstring - list_to_existing_atom list_to_float list_to_integer list_to_pid - list_to_tuple load_module localtime_to_universaltime make_tuple - md5 md5_final md5_update memory module_loaded monitor monitor_node - node nodes open_port phash phash2 pid_to_list port_close port_command - port_connect port_control port_call port_info port_to_list - process_display process_flag process_info purge_module put read_timer - ref_to_list register resume_processround send send_after send_nosuspend - set_cookie setelement size spawn spawn_link spawn_monitor spawn_opt - split_binary start_timer statistics suspend_process system_flag - system_info system_monitor system_profile term_to_binary tl trace - trace_delivered trace_info trace_pattern trunc tuple_size tuple_to_list - universaltime_to_localtime unlink unregister whereis - ] + # Auto-imported BIFs, sourced at compile time from `erl_internal:bif/2` — + # the same predicate the Erlang compiler 
uses to decide what's auto-imported. + # Refreshed every time `makeup_erlang` is rebuilt, so the list stays in sync + # with the OTP version we compile against and never bit-rots. + @builtins :erlang.module_info(:exports) + |> Enum.filter(fn {name, arity} -> :erl_internal.bif(name, arity) end) + |> Enum.map(fn {name, _arity} -> Atom.to_string(name) end) + |> Enum.uniq() + |> Enum.sort() @word_operators ~W[and andalso band bnot bor bsl bsr bxor div not or orelse rem xor] + # Record names tagged by the `record_name` combinator should not be + # reclassified as keywords / builtins / word-operators even if their + # text happens to match. Strip the marker after acting on it so it + # doesn't leak into the rendered output. + defp postprocess_helper([{:string_symbol, %{record_name: true} = meta, value} | tokens]), + do: [{:string_symbol, Map.delete(meta, :record_name), value} | postprocess_helper(tokens)] + defp postprocess_helper([{:string_symbol, meta, value} | tokens]) when value in @keywords, do: [{:keyword, meta, value} | postprocess_helper(tokens)] + # Keywords followed by `(` are first matched by the `function` rule and + # tagged `:name_function`. Recover them here. The most common case is + # `fun(X) -> ... end`; the rule also covers any other keyword that gets + # written next to `(` (e.g. `if(X)` in a teaching example of invalid + # syntax). + defp postprocess_helper([{:name_function, meta, value} | tokens]) when value in @keywords, + do: [{:keyword, meta, value} | postprocess_helper(tokens)] + defp postprocess_helper([{:string_symbol, meta, value} | tokens]) when value in @builtins, do: [{:name_builtin, meta, value} | postprocess_helper(tokens)] + # Same recovery for builtins: when a BIF is called as `length(L)` it is + # first matched by the `function` rule and tagged `:name_function`. Closes + # makeup_erlang #13. 
+ defp postprocess_helper([{:name_function, meta, value} | tokens]) when value in @builtins, + do: [{:name_builtin, meta, value} | postprocess_helper(tokens)] + defp postprocess_helper([{:string_symbol, meta, value} | tokens]) when value in @word_operators, do: [{:operator_word, meta, value} | postprocess_helper(tokens)] diff --git a/test/makeup/erlang_lexer/erlang_lexer_tokenizer_test.exs b/test/makeup/erlang_lexer/erlang_lexer_tokenizer_test.exs index 84ba145..c4baa6b 100644 --- a/test/makeup/erlang_lexer/erlang_lexer_tokenizer_test.exs +++ b/test/makeup/erlang_lexer/erlang_lexer_tokenizer_test.exs @@ -20,6 +20,36 @@ defmodule ErlangLexerTokenizer do assert lex("$🫂") == [{:string_char, %{}, "$🫂"}] end + describe "character escape sequences" do + test "named escapes" do + assert lex("$\\n") == [{:string_char, %{}, "$\\n"}] + assert lex("$\\t") == [{:string_char, %{}, "$\\t"}] + assert lex("$\\\\") == [{:string_char, %{}, "$\\\\"}] + assert lex("$\\\"") == [{:string_char, %{}, "$\\\""}] + end + + test "octal escape" do + assert lex("$\\7") == [{:string_char, %{}, "$\\7"}] + assert lex("$\\07") == [{:string_char, %{}, "$\\07"}] + assert lex("$\\077") == [{:string_char, %{}, "$\\077"}] + end + + test "hex escape (two-digit form)" do + assert lex("$\\xFF") == [{:string_char, %{}, "$\\xFF"}] + assert lex("$\\x4a") == [{:string_char, %{}, "$\\x4a"}] + end + + test "hex escape (braced form)" do + assert lex("$\\x{1F600}") == [{:string_char, %{}, "$\\x{1F600}"}] + assert lex("$\\x{0}") == [{:string_char, %{}, "$\\x{0}"}] + end + + test "control escape" do + assert lex("$\\^A") == [{:string_char, %{}, "$\\^A"}] + assert lex("$\\^z") == [{:string_char, %{}, "$\\^z"}] + end + end + test "comment" do assert lex("%abc") == [{:comment_single, %{}, "%abc"}] assert lex("% abc") == [{:comment_single, %{}, "% abc"}] @@ -50,6 +80,36 @@ defmodule ErlangLexerTokenizer do assert lex("A_b1") == [{:name, %{}, "A_b1"}] end + describe "underscore-prefixed variables" do + test "underscore + 
digit lexes as a single variable" do + assert lex("_5") == [{:name, %{}, "_5"}] + end + + test "underscore + lowercase lexes as a single variable" do + assert lex("_unused") == [{:name, %{}, "_unused"}] + end + + test "underscore + uppercase lexes as a single variable" do + assert lex("_X") == [{:name, %{}, "_X"}] + end + + test "bare underscore (wildcard) stays as punctuation" do + # Pattern wildcard. Treat as punctuation so themes can render it + # distinctly from a variable name. + assert [ + {:keyword, %{}, "case"}, + {:whitespace, %{}, " "}, + {:name, %{}, "X"}, + {:whitespace, %{}, " "}, + {:keyword, %{}, "of"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "_"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "->"} | _ + ] = lex("case X of _ -> ok end") + end + end + test "function call" do assert lex("f(") == [ {:name_function, %{}, "f"}, @@ -94,6 +154,33 @@ defmodule ErlangLexerTokenizer do assert lex("1.05e12") == [{:number_float, %{}, "1.05e12"}] assert lex("1.05e-6") == [{:number_float, %{}, "1.05e-6"}] assert lex("1.05e-12") == [{:number_float, %{}, "1.05e-12"}] + assert lex("1.05e+6") == [{:number_float, %{}, "1.05e+6"}] + assert lex("1.0e+10") == [{:number_float, %{}, "1.0e+10"}] + end + + # Numeric separators (`_`) are valid inside numeric literals since OTP 27. 
+ test "integers with underscore separators" do + assert lex("1_000") == [{:number_integer, %{}, "1_000"}] + assert lex("1_000_000") == [{:number_integer, %{}, "1_000_000"}] + end + + test "floats with underscore separators" do + assert lex("1_000.5") == [{:number_float, %{}, "1_000.5"}] + assert lex("3.14_15") == [{:number_float, %{}, "3.14_15"}] + end + + test "weird-base integers with underscore separators" do + assert lex("16#FF_FF") == [{:number_integer, %{}, "16#FF_FF"}] + assert lex("2#1010_1010") == [{:number_integer, %{}, "2#1010_1010"}] + end + + test "trailing identifier after a number is not absorbed via underscore" do + # `1_000` is a number; the bare identifier following with whitespace is separate. + assert [ + {:number_integer, %{}, "1_000"}, + {:whitespace, %{}, " "}, + {:name, %{}, "X"} + ] = lex("1_000 X") end end @@ -121,16 +208,53 @@ defmodule ErlangLexerTokenizer do end test "tokenizes escape of double quotes correctly" do - assert [{:string, %{}, ~s/"escape \\"double quote\\""/}] == - lex(~s/"escape \\"double quote\\""/) + # Strings now produce :string_escape sub-tokens for each escape + # sequence (mirroring the triple-quoted-string behaviour and + # `makeup_elixir`). Themes can render escapes distinctly from the + # surrounding string body. 
+ assert [ + {:string, %{}, ~s/"escape /}, + {:string_escape, %{}, ~s/\\"/}, + {:string, %{}, "double quote"}, + {:string_escape, %{}, ~s/\\"/}, + {:string, %{}, "\""} + ] = lex(~s/"escape \\"double quote\\""/) - assert [{:string, %{}, ~s/"\\"quote\\""/}] == lex(~s/"\\"quote\\""/) assert {:string, %{}, ~s/"invalid string\\"/} not in lex(~s/"invalid string\\"/) end test "tokenizes literal escaped characters correctly" do - assert [{:string, %{}, ~s/"\\b"/}] == lex(~s/"\\b"/) - assert [{:string, %{}, ~s/"\\\\b"/}] == lex(~s/"\\\\b"/) + assert [ + {:string, %{}, "\""}, + {:string_escape, %{}, "\\b"}, + {:string, %{}, "\""} + ] = lex(~s/"\\b"/) + + assert [ + {:string, %{}, "\""}, + {:string_escape, %{}, "\\\\"}, + {:string, %{}, "b\""} + ] = lex(~s/"\\\\b"/) + end + + test "tokenizes hex / octal / control escapes inside strings" do + assert [ + {:string, %{}, ~s/"a/}, + {:string_escape, %{}, ~s/\\xFF/}, + {:string, %{}, "b\""} + ] = lex(~s/"a\\xFFb"/) + + assert [ + {:string, %{}, ~s/"a/}, + {:string_escape, %{}, "\\077"}, + {:string, %{}, "b\""} + ] = lex(~s/"a\\077b"/) + + assert [ + {:string, %{}, ~s/"a/}, + {:string_escape, %{}, "\\^A"}, + {:string, %{}, "b\""} + ] = lex(~s/"a\\^Ab"/) end end @@ -404,6 +528,7 @@ defmodule ErlangLexerTokenizer do assert lex("<:-") == [{:operator, %{}, "<:-"}] assert lex("<=") == [{:operator, %{}, "<="}] assert lex("<:=") == [{:operator, %{}, "<:="}] + assert lex("?=") == [{:operator, %{}, "?="}] end test "word operators are tokenized as operator" do @@ -553,6 +678,813 @@ defmodule ErlangLexerTokenizer do end end + describe "maybe expression" do + # `?=` is the maybe-expression match operator added in OTP 25. 
+ test "tokenizes ?= as a single operator inside a maybe block" do + assert lex("maybe X ?= ok end") == [ + {:keyword, %{}, "maybe"}, + {:whitespace, %{}, " "}, + {:name, %{}, "X"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "?="}, + {:whitespace, %{}, " "}, + {:string_symbol, %{}, "ok"}, + {:whitespace, %{}, " "}, + {:keyword, %{}, "end"} + ] + end + end + + describe "builtin (BIF) recognition" do + # The @builtins list is generated at compile time from `erl_internal:bif/2`. + test "atoms that are auto-imported BIFs render as :name_builtin" do + assert [{:name_builtin, %{}, "length"}] = lex("length") + assert [{:name_builtin, %{}, "tuple_size"}] = lex("tuple_size") + end + + test "BIF calls (`name(...)`) render as :name_builtin not :name_function" do + # makeup_erlang #13. Before this fix, `length(L)` rendered as a regular + # function call instead of a builtin. + assert [{:name_builtin, %{}, "length"} | _] = lex("length(L)") + assert [{:name_builtin, %{}, "is_atom"} | _] = lex("is_atom(X)") + assert [{:name_builtin, %{}, "tuple_size"} | _] = lex("tuple_size(T)") + end + + test "post-OTP-19 BIFs are recognised (proves the static list is gone)" do + assert [{:name_builtin, %{}, "map_get"} | _] = lex("map_get(K, M)") + assert [{:name_builtin, %{}, "is_map_key"} | _] = lex("is_map_key(K, M)") + assert [{:name_builtin, %{}, "binary_part"} | _] = lex("binary_part(B, 0, 4)") + assert [{:name_builtin, %{}, "floor"} | _] = lex("floor(X)") + assert [{:name_builtin, %{}, "ceil"} | _] = lex("ceil(X)") + end + + test "module_info and nif_error are not classified as BIFs" do + # Both are exported from `erlang` but neither is auto-imported. 
+ refute Enum.any?(lex("module_info"), &match?({:name_builtin, _, "module_info"}, &1)) + refute Enum.any?(lex("nif_error"), &match?({:name_builtin, _, "nif_error"}, &1)) + end + end + + describe "macros" do + test "parameterless macro tokenizes as :name_constant" do + assert lex("?FOO") == [{:name_constant, %{}, "?FOO"}] + assert lex("?bar") == [{:name_constant, %{}, "?bar"}] + end + + test "parameterised macro head tokenizes as :name_function" do + assert [ + {:name_function, %{}, "?FOO"}, + {:punctuation, _, "("}, + {:name, %{}, "X"}, + {:punctuation, _, ")"} + ] = lex("?FOO(X)") + end + + test "parameterless macro followed by punctuation stays as constant" do + # `?FOO,` shouldn't be lured into the parameterised form. + assert [ + {:name_constant, %{}, "?FOO"}, + {:punctuation, %{}, ","} | _ + ] = lex("?FOO, X") + end + end + + describe "function clauses with guards" do + test "guard with operator and BIF" do + assert [ + {:name_function, %{}, "f"}, + {:punctuation, _, "("}, + {:name, %{}, "X"}, + {:punctuation, _, ")"}, + {:whitespace, %{}, " "}, + {:keyword, %{}, "when"}, + {:whitespace, %{}, " "}, + {:name, %{}, "X"}, + {:whitespace, %{}, " "}, + {:operator, %{}, ">"}, + {:whitespace, %{}, " "}, + {:number_integer, %{}, "0"}, + {:punctuation, %{}, ","}, + {:whitespace, %{}, " "}, + {:name_builtin, %{}, "is_integer"}, + {:punctuation, _, "("}, + {:name, %{}, "X"}, + {:punctuation, _, ")"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "->"} | _ + ] = lex("f(X) when X > 0, is_integer(X) -> X * 2.") + end + + test "guard sequence with `;` (alternative guards)" do + assert lex("f(X) when X < 0; X > 100 -> out_of_range.") == [ + {:name_function, %{}, "f"}, + {:punctuation, %{group_id: "group-1"}, "("}, + {:name, %{}, "X"}, + {:punctuation, %{group_id: "group-1"}, ")"}, + {:whitespace, %{}, " "}, + {:keyword, %{}, "when"}, + {:whitespace, %{}, " "}, + {:name, %{}, "X"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "<"}, + {:whitespace, %{}, " "}, + 
{:number_integer, %{}, "0"}, + {:punctuation, %{}, ";"}, + {:whitespace, %{}, " "}, + {:name, %{}, "X"}, + {:whitespace, %{}, " "}, + {:operator, %{}, ">"}, + {:whitespace, %{}, " "}, + {:number_integer, %{}, "100"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "->"}, + {:whitespace, %{}, " "}, + {:string_symbol, %{}, "out_of_range"}, + {:punctuation, %{}, "."} + ] + end + + test "guard with word operators (`andalso`, `orelse`)" do + assert lex("f(X) when is_atom(X) andalso X =/= undefined -> ok.") == [ + {:name_function, %{}, "f"}, + {:punctuation, %{group_id: "group-1"}, "("}, + {:name, %{}, "X"}, + {:punctuation, %{group_id: "group-1"}, ")"}, + {:whitespace, %{}, " "}, + {:keyword, %{}, "when"}, + {:whitespace, %{}, " "}, + {:name_builtin, %{}, "is_atom"}, + {:punctuation, %{group_id: "group-2"}, "("}, + {:name, %{}, "X"}, + {:punctuation, %{group_id: "group-2"}, ")"}, + {:whitespace, %{}, " "}, + {:operator_word, %{}, "andalso"}, + {:whitespace, %{}, " "}, + {:name, %{}, "X"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "=/="}, + {:whitespace, %{}, " "}, + {:string_symbol, %{}, "undefined"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "->"}, + {:whitespace, %{}, " "}, + {:string_symbol, %{}, "ok"}, + {:punctuation, %{}, "."} + ] + end + end + + describe "newer comprehensions (OTP 26 / 27)" do + test "list comprehension with strict generator (OTP 27)" do + assert lex("[X || X <:- L]") == [ + {:punctuation, %{group_id: "group-1"}, "["}, + {:name, %{}, "X"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "||"}, + {:whitespace, %{}, " "}, + {:name, %{}, "X"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "<:-"}, + {:whitespace, %{}, " "}, + {:name, %{}, "L"}, + {:punctuation, %{group_id: "group-1"}, "]"} + ] + end + + test "map comprehension (OTP 26)" do + # `#{K => V * 2 || K := V <- M}` exercises map-open `\#{`, + # map arrow `=>`, comprehension separator `||`, map match + # operator `:=`, and the list-generator operator `<-`. 
+ assert lex("\#{K => V * 2 || K := V <- M}") == [ + {:punctuation, %{group_id: "group-1"}, "\#{"}, + {:name, %{}, "K"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "=>"}, + {:whitespace, %{}, " "}, + {:name, %{}, "V"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "*"}, + {:whitespace, %{}, " "}, + {:number_integer, %{}, "2"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "||"}, + {:whitespace, %{}, " "}, + {:name, %{}, "K"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, ":="}, + {:whitespace, %{}, " "}, + {:name, %{}, "V"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "<-"}, + {:whitespace, %{}, " "}, + {:name, %{}, "M"}, + {:punctuation, %{group_id: "group-1"}, "}"} + ] + end + + test "bitstring comprehension with `<=` generator" do + # `<<>>` brackets, the bitstring-generator operator `<=`, and + # nested `<<>>` patterns inside. + assert lex("<< <<X:8>> || <<X:8>> <= Bin >>") == [ + {:punctuation, %{group_id: "group-1"}, "<<"}, + {:whitespace, %{}, " "}, + {:punctuation, %{group_id: "group-2"}, "<<"}, + {:name, %{}, "X"}, + {:punctuation, %{}, ":"}, + {:number_integer, %{}, "8"}, + {:punctuation, %{group_id: "group-2"}, ">>"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "||"}, + {:whitespace, %{}, " "}, + {:punctuation, %{group_id: "group-3"}, "<<"}, + {:name, %{}, "X"}, + {:punctuation, %{}, ":"}, + {:number_integer, %{}, "8"}, + {:punctuation, %{group_id: "group-3"}, ">>"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "<="}, + {:whitespace, %{}, " "}, + {:name, %{}, "Bin"}, + {:whitespace, %{}, " "}, + {:punctuation, %{group_id: "group-1"}, ">>"} + ] + end + end + + describe "real-world module fragment (integration)" do + # Exercises module attribute, doc string, function head with guard, + # body with a map comprehension and BIF calls. If any rule's choice order + # gets perturbed, this is the test most likely to catch it. + test "small module with -doc, guard, map, and BIF call" do + src = """ + + -module(positives). + -export([keep/1]). 
+ + -doc \"\"\" + Keep map entries whose values are positive integers. + \"\"\". + keep(M) when is_map(M) -> + \#{K => V || K := V <- M, is_integer(V), V > 0}. + """ + + assert lex(src) == [ + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "module"}, + {:punctuation, %{group_id: "group-1"}, "("}, + {:string_symbol, %{}, "positives"}, + {:punctuation, %{group_id: "group-1"}, ")"}, + {:punctuation, %{}, "."}, + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "export"}, + {:punctuation, %{group_id: "group-2"}, "("}, + {:punctuation, %{group_id: "group-3"}, "["}, + {:string_symbol, %{}, "keep"}, + {:punctuation, %{}, "/"}, + {:number_integer, %{}, "1"}, + {:punctuation, %{group_id: "group-3"}, "]"}, + {:punctuation, %{group_id: "group-2"}, ")"}, + {:punctuation, %{}, "."}, + {:whitespace, %{}, "\n"}, + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "doc"}, + {:whitespace, %{}, " "}, + {:string, %{}, + "\"\"\"\nKeep map entries whose values are positive integers.\n\"\"\""}, + {:punctuation, %{}, "."}, + {:whitespace, %{}, "\n"}, + {:name_function, %{}, "keep"}, + {:punctuation, %{group_id: "group-4"}, "("}, + {:name, %{}, "M"}, + {:punctuation, %{group_id: "group-4"}, ")"}, + {:whitespace, %{}, " "}, + {:keyword, %{}, "when"}, + {:whitespace, %{}, " "}, + {:name_builtin, %{}, "is_map"}, + {:punctuation, %{group_id: "group-5"}, "("}, + {:name, %{}, "M"}, + {:punctuation, %{group_id: "group-5"}, ")"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "->"}, + {:whitespace, %{}, "\n "}, + {:punctuation, %{group_id: "group-6"}, "\#{"}, + {:name, %{}, "K"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "=>"}, + {:whitespace, %{}, " "}, + {:name, %{}, "V"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "||"}, + {:whitespace, %{}, " "}, + {:name, %{}, "K"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, ":="}, + {:whitespace, %{}, " "}, + {:name, %{}, "V"}, + {:whitespace, 
%{}, " "}, + {:operator, %{}, "<-"}, + {:whitespace, %{}, " "}, + {:name, %{}, "M"}, + {:punctuation, %{}, ","}, + {:whitespace, %{}, " "}, + {:name_builtin, %{}, "is_integer"}, + {:punctuation, %{group_id: "group-7"}, "("}, + {:name, %{}, "V"}, + {:punctuation, %{group_id: "group-7"}, ")"}, + {:punctuation, %{}, ","}, + {:whitespace, %{}, " "}, + {:name, %{}, "V"}, + {:whitespace, %{}, " "}, + {:operator, %{}, ">"}, + {:whitespace, %{}, " "}, + {:number_integer, %{}, "0"}, + {:punctuation, %{group_id: "group-6"}, "}"}, + {:punctuation, %{}, "."}, + {:whitespace, %{}, "\n"} + ] + end + end + + describe "fun keyword vs function call" do + test "fun(X) -> ... end tokenizes `fun` as keyword, not function name" do + assert [ + {:keyword, %{}, "fun"}, + {:punctuation, _, "("}, + {:name, %{}, "X"}, + {:punctuation, _, ")"} | _ + ] = lex("fun(X) -> X end") + end + + test "fun mod:func/2 still tokenizes correctly" do + assert [ + {:keyword, %{}, "fun"}, + {:whitespace, %{}, " "}, + {:name_class, %{}, "mod"}, + {:punctuation, %{}, ":"}, + {:string_symbol, %{}, "func"}, + {:punctuation, %{}, "/"}, + {:number_integer, %{}, "2"} + ] = lex("fun mod:func/2") + end + end + + # https://www.erlang.org/doc/system/data_types.html#sigil + describe "sigil delimiters (OTP 27 spec coverage)" do + # Pair delimiters: () [] {} <> + test "pair delimiters" do + for {open, close} <- [{"(", ")"}, {"[", "]"}, {"{", "}"}, {"<", ">"}] do + src = "~b" <> open <> "hi" <> close + + assert [{:string, %{}, ^src}] = lex(src), + "expected ~b#{open}hi#{close} to lex as a single :string" + end + end + + # Symmetric delimiters: / | ' " ` # + test "symmetric delimiters" do + for delim <- ["/", "|", "'", "\"", "`", "#"] do + src = "~b" <> delim <> "hi" <> delim + + assert [{:string, %{}, ^src}] = lex(src), + "expected ~b#{delim}hi#{delim} to lex as a single :string" + end + end + + test "triple-quote and triple-single-quote" do + assert [{:string, %{}, "~b\"\"\"\nhi\n\"\"\""}] = + lex("~b\"\"\"\nhi\n\"\"\"") 
+ + assert [{:string, %{}, "~b'''\nhi\n'''"}] = + lex("~b'''\nhi\n'''") + end + + test "all sigil prefix kinds (~ ~b ~B ~s ~S) work with the same delimiters" do + for prefix <- ["~", "~b", "~B", "~s", "~S"] do + src = prefix <> "/hi/" + + assert [{:string, %{}, ^src}] = lex(src), + "expected #{prefix}/hi/ to lex as a single :string" + end + end + end + + describe "multi-quoted strings (OTP 27+)" do + test "triple-quoted string lexes as a single :string" do + assert [{:string, %{}, "\"\"\"\nfoo\n\"\"\""}] = lex("\"\"\"\nfoo\n\"\"\"") + end + + test "quadruple-quoted string lexes as a single :string" do + assert [{:string, %{}, "\"\"\"\"\nfoo\n\"\"\"\""}] = + lex("\"\"\"\"\nfoo\n\"\"\"\"") + end + + test "quadruple-quoted string can contain triple quotes in its body" do + # The whole point of using a quadruple opener: lets the body include + # `"""` literally without ending the string. + assert [{:string, %{}, body}] = + lex("\"\"\"\"\nhello \"\"\" inside\n\"\"\"\"") + + assert body =~ "\"\"\"" + end + + test "quintuple-quoted string can contain quadruple quotes in its body" do + assert [{:string, %{}, body}] = + lex("\"\"\"\"\"\nhi \"\"\"\" foo\n\"\"\"\"\"") + + assert body =~ "\"\"\"\"" + end + + test "escape sub-tokens still emitted inside quadruple-quoted strings" do + assert [ + {:string, %{}, "\"\"\"\"\nhi "}, + {:string_escape, %{}, "\\xFF"}, + {:string, %{}, " there\n\"\"\"\""} + ] = lex("\"\"\"\"\nhi \\xFF there\n\"\"\"\"") + end + + test "sigil prefixes work with quadruple-quoted strings" do + assert [{:string, %{}, "~b\"\"\"\"\nfoo\n\"\"\"\""}] = + lex("~b\"\"\"\"\nfoo\n\"\"\"\"") + + assert [{:string, %{}, "~B\"\"\"\"\nhello \"\"\" inside\n\"\"\"\""}] = + lex("~B\"\"\"\"\nhello \"\"\" inside\n\"\"\"\"") + + assert [{:string, %{}, "~\"\"\"\"\nhi\n\"\"\"\""}] = + lex("~\"\"\"\"\nhi\n\"\"\"\"") + end + end + + describe "doc / moduledoc attributes (OTP 27+)" do + test "moduledoc with triple-quoted body" do + src = "-moduledoc \"\"\"\nThis module does 
X.\n\"\"\"" + + assert [ + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "moduledoc"}, + {:whitespace, %{}, " "}, + {:string, %{}, "\"\"\"\nThis module does X.\n\"\"\""} + ] = lex(src) + end + + test "doc attribute followed by a function definition" do + src = "-doc \"\"\"\nReturns true if X is positive.\n\"\"\".\nis_pos(X) when X > 0 -> true." + + assert lex(src) == [ + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "doc"}, + {:whitespace, %{}, " "}, + {:string, %{}, "\"\"\"\nReturns true if X is positive.\n\"\"\""}, + {:punctuation, %{}, "."}, + {:whitespace, %{}, "\n"}, + {:name_function, %{}, "is_pos"}, + {:punctuation, %{group_id: "group-1"}, "("}, + {:name, %{}, "X"}, + {:punctuation, %{group_id: "group-1"}, ")"}, + {:whitespace, %{}, " "}, + {:keyword, %{}, "when"}, + {:whitespace, %{}, " "}, + {:name, %{}, "X"}, + {:whitespace, %{}, " "}, + {:operator, %{}, ">"}, + {:whitespace, %{}, " "}, + {:number_integer, %{}, "0"}, + {:whitespace, %{}, " "}, + {:punctuation, %{}, "->"}, + {:whitespace, %{}, " "}, + {:string_symbol, %{}, "true"}, + {:punctuation, %{}, "."} + ] + end + + test "doc with single-line string body still works" do + src = "-doc \"short\"." + + assert [ + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "doc"}, + {:whitespace, %{}, " "}, + {:string, %{}, "\"short\""}, + {:punctuation, %{}, "."} + ] = lex(src) + end + end + + describe "OTP-current module attribute coverage" do + # The generic `module_attribute` rule accepts any `atom_name`, which + # means new attributes ship without lexer changes. Lock the current + # OTP-supported set with an explicit assertion list so the rule + # keeps covering them. 
+ @known_attributes ~w[module export import behaviour behavior callback + optional_callbacks on_load nifs deprecated removed + feature compile export_type record export_record + import_record spec type opaque doc moduledoc define + ifdef ifndef else endif if elif vsn] + + test "every current OTP module attribute lexes as :name_attribute" do + for attr <- @known_attributes do + # Use `(Body)` so the body is one well-known token. The point of + # the test is the attribute name, not the body shape. + expected = [ + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, attr}, + {:punctuation, %{group_id: "group-1"}, "("}, + {:name, %{}, "Body"}, + {:punctuation, %{group_id: "group-1"}, ")"} + ] + + actual = lex("\n-" <> attr <> "(Body)") + + assert actual == expected, + "expected -#{attr} to lex as :name_attribute\n" <> + "expected: #{inspect(expected)}\n" <> + "actual: #{inspect(actual)}" + end + end + end + + describe "native records (OTP 29)" do + test "tokenizes external native record construction" do + assert [ + {:operator, %{}, "#"}, + {:name_class, %{}, "vector_lib"}, + {:punctuation, %{}, ":"}, + {:string_symbol, %{}, "vector"}, + {:punctuation, %{}, "{"} | _ + ] = lex("#vector_lib:vector{x = 1.0, y = 2.0}") + end + + test "tokenizes external native record print form" do + assert [ + {:operator, %{}, "#"}, + {:name_class, %{}, "example"}, + {:punctuation, %{}, ":"}, + {:string_symbol, %{}, "pair"}, + {:punctuation, %{}, "{"} | _ + ] = lex("#example:pair{a = 1, b = 2}") + end + + test "tokenizes external native record field access" do + assert [ + {_, %{}, "X"}, + {:operator, %{}, "#"}, + {:name_class, %{}, "vector_lib"}, + {:punctuation, %{}, ":"}, + {:string_symbol, %{}, "vector"}, + {:punctuation, %{}, "."} | _ + ] = lex("X#vector_lib:vector.x") + end + + test "tokenizes local native record construction the same as tuple-based records" do + assert [ + {:operator, %{}, "#"}, + {:string_symbol, %{}, "pair"}, + {:punctuation, %{}, "{"} 
| _ + ] = lex("#pair{a = 1, b = 2}") + end + + test "tokenizes -record #Name{...} native definition attribute" do + tokens = lex("\n-record #pair{a, b}.") + assert {:name_attribute, %{}, "record"} in tokens + assert {:operator, %{}, "#"} in tokens + assert {:string_symbol, %{}, "pair"} in tokens + end + + test "tokenizes -export_record attribute" do + assert [ + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "export_record"} | _ + ] = lex("\n-export_record([vector, position]).") + end + + test "tokenizes -import_record attribute" do + assert [ + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "import_record"} | _ + ] = lex("\n-import_record(vector_lib, [vector, position]).") + end + + test "does not break the existing local-record rule when there is no `:`" do + tokens = lex("X#name{f = 1}") + assert {:operator, %{}, "#"} in tokens + assert {:string_symbol, %{}, "name"} in tokens + refute Enum.any?(tokens, fn t -> match?({:name_class, _, _}, t) end) + end + + test "external native record pattern match" do + assert [ + {:operator, %{}, "#"}, + {:name_class, %{}, "mod"}, + {:punctuation, %{}, ":"}, + {:string_symbol, %{}, "name"}, + {:punctuation, _, "{"}, + {:string_symbol, %{}, "f"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "="}, + {:whitespace, %{}, " "}, + {:name, %{}, "X"}, + {:punctuation, _, "}"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "="}, + {:whitespace, %{}, " "}, + {:name, %{}, "Y"} + ] = lex("#mod:name{f = X} = Y") + end + + test "external native record update via prefixed variable" do + assert [ + {:name, %{}, "Y"}, + {:operator, %{}, "#"}, + {:name_class, %{}, "mod"}, + {:punctuation, %{}, ":"}, + {:string_symbol, %{}, "name"}, + {:punctuation, _, "{"} | _ + ] = lex("Y#mod:name{f = 2}") + end + + # Native records relax the record-name rule: + # https://www.erlang.org/doc/system/data_types.html says "it is not + # necessary to quote atoms that look like variable names or 
keywords." + # So `#State{}`, `#div{}`, `#case{}` are all valid. + test "variable-shape name (`#State{}`)" do + assert [ + {:operator, %{}, "#"}, + {:string_symbol, %{}, "State"}, + {:punctuation, _, "{"}, + {:string_symbol, %{}, "x"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "="}, + {:whitespace, %{}, " "}, + {:number_integer, %{}, "1"}, + {:punctuation, _, "}"} + ] = lex("#State{x = 1}") + end + + test "external native record with variable-shape name" do + assert [ + {:operator, %{}, "#"}, + {:name_class, %{}, "mod"}, + {:punctuation, %{}, ":"}, + {:string_symbol, %{}, "State"}, + {:punctuation, _, "{"} | _ + ] = lex("#mod:State{x = 1}") + end + + # Keyword and word-operator names stay as `:string_symbol` in record + # position. Postprocess sees the `record_name: true` meta marker and + # skips the usual conversion to `:keyword` / `:operator_word`, so the + # surrounding `#...{` shape renders consistently regardless of whether + # the name happens to be a reserved word. + test "keyword name (`#case{}`) stays as :string_symbol" do + assert [ + {:operator, %{}, "#"}, + {:string_symbol, %{}, "case"}, + {:punctuation, _, "{"} | _ + ] = lex("#case{x = 1}") + end + + test "keyword name (`#fun{}`)" do + assert [ + {:operator, %{}, "#"}, + {:string_symbol, %{}, "fun"}, + {:punctuation, _, "{"} | _ + ] = lex("#fun{f = g}") + end + + test "word-operator name (`#div{}`)" do + assert [ + {:operator, %{}, "#"}, + {:string_symbol, %{}, "div"}, + {:punctuation, _, "{"} | _ + ] = lex("#div{class}") + end + + test "external native record with keyword name (`#mod:case{}`)" do + assert [ + {:operator, %{}, "#"}, + {:name_class, %{}, "mod"}, + {:punctuation, %{}, ":"}, + {:string_symbol, %{}, "case"}, + {:punctuation, _, "{"} | _ + ] = lex("#mod:case{x = 1}") + end + + test "quoted-atom record name (`#'42'{}`)" do + assert [ + {:operator, %{}, "#"}, + {:string_symbol, %{}, "'42'"}, + {:punctuation, _, "{"} | _ + ] = lex("#'42'{}") + end + + # Declaration syntax: `-record 
#Name{...}.` (no parens around the name). + # This is the OTP 29 native-record definition form, distinct from the + # tuple-based `-record(name, {...}).` form. The same name flexibility + # (lowercase / variable-shape / keyword / quoted) applies. + test "definition with lowercase name" do + assert lex("\n-record #pair{a, b}.") == [ + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "record"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "#"}, + {:string_symbol, %{}, "pair"}, + {:punctuation, %{group_id: "group-1"}, "{"}, + {:string_symbol, %{}, "a"}, + {:punctuation, %{}, ","}, + {:whitespace, %{}, " "}, + {:string_symbol, %{}, "b"}, + {:punctuation, %{group_id: "group-1"}, "}"}, + {:punctuation, %{}, "."} + ] + end + + test "definition with variable-shape name (`-record #State{x}.`)" do + assert lex("\n-record #State{x}.") == [ + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "record"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "#"}, + {:string_symbol, %{}, "State"}, + {:punctuation, %{group_id: "group-1"}, "{"}, + {:string_symbol, %{}, "x"}, + {:punctuation, %{group_id: "group-1"}, "}"}, + {:punctuation, %{}, "."} + ] + end + + test "definition with keyword name (`-record #div{class}.`)" do + assert lex("\n-record #div{class}.") == [ + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "record"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "#"}, + {:string_symbol, %{}, "div"}, + {:punctuation, %{group_id: "group-1"}, "{"}, + {:string_symbol, %{}, "class"}, + {:punctuation, %{group_id: "group-1"}, "}"}, + {:punctuation, %{}, "."} + ] + end + + test "definition with quoted name (`-record #'42'{}.`)" do + assert lex("\n-record #'42'{}.") == [ + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "record"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "#"}, + {:string_symbol, %{}, "'42'"}, + {:punctuation, %{group_id: "group-1"}, "{"}, + 
{:punctuation, %{group_id: "group-1"}, "}"}, + {:punctuation, %{}, "."} + ] + end + + test "the record_name meta marker does not leak into output tokens" do + # Postprocess strips the marker after acting on it. End-to-end the + # token's meta should be the same as for any other :string_symbol. + [_, {:string_symbol, meta_kw, "case"} | _] = lex("#case{x = 1}") + [_, {:string_symbol, meta_lc, "vector"} | _] = lex("#vector{x = 1}") + assert meta_kw == meta_lc + refute Map.has_key?(meta_kw, :record_name) + end + + test "definition with default values" do + # `-record #vector{x = 0.0, y = 0.0}.` — the OTP 29 spec example. + assert lex("\n-record #vector{x = 0.0, y = 0.0}.") == [ + {:whitespace, %{}, "\n"}, + {:punctuation, %{}, "-"}, + {:name_attribute, %{}, "record"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "#"}, + {:string_symbol, %{}, "vector"}, + {:punctuation, %{group_id: "group-1"}, "{"}, + {:string_symbol, %{}, "x"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "="}, + {:whitespace, %{}, " "}, + {:number_float, %{}, "0.0"}, + {:punctuation, %{}, ","}, + {:whitespace, %{}, " "}, + {:string_symbol, %{}, "y"}, + {:whitespace, %{}, " "}, + {:operator, %{}, "="}, + {:whitespace, %{}, " "}, + {:number_float, %{}, "0.0"}, + {:punctuation, %{group_id: "group-1"}, "}"}, + {:punctuation, %{}, "."} + ] + end + end + describe "function_arity" do test "is tokenized correctly for the syntax function_name/arity" do assert [ @@ -615,6 +1547,19 @@ defmodule ErlangLexerTokenizer do ] end + # makeup_elixir #28 analogue. The whitespace rule used to consume + # multi-line whitespace blocks greedily, leaving no `\n` for the prompt + # rule to anchor against. The prompt rule now matches any leading + # whitespace block that contains a `\n`. 
+ test "is detected after a multi-line whitespace block" do + assert [ + {:whitespace, %{}, "\n \n"}, + {:generic_prompt, %{selectable: false}, "1> "}, + {:string_symbol, %{}, "ok"}, + {:punctuation, %{}, "."} + ] = lex("\n \n1> ok.") + end + test "with newlines" do assert lex("x. 1> a.") == [ {:string_symbol, %{}, "x"}, @@ -692,7 +1637,7 @@ defmodule ErlangLexerTokenizer do *** argument 1: not an iolist term """) == [ {:generic_prompt, %{selectable: false}, "1> "}, - {:name_function, %{}, "list_to_binary"}, + {:name_builtin, %{}, "list_to_binary"}, {:punctuation, %{group_id: "group-1"}, "("}, {:punctuation, %{group_id: "group-2"}, "<<"}, {:punctuation, %{group_id: "group-2"}, ">>"},