Модуль:Pegex
Pegex — модуль, преобразующий регулярное выражение в стиле Perl (с ограничениями) в LPeg.
Автор — Chris Emerson [email protected], 2014.
Источник — https://github.com/jugglerchris/ta-regex/blob/master/pegex.lua.
Алгоритм разработан Роберту Иерусалимски, Марсело Оикава и Анной Лусией де Моура (Marcelo Oikawa, Roberto Ierusalimschy, Anna Lúcia de Moura «Converting regexes to Parsing Expression Grammars» // Departamento de Informática, PUC-Rio.).
Порядок вызова из модулей «Традиции»:
local pat = require 'Module:pegex'.compile '(?:foo|bar)+' -- это грамматика LPEG.
local result1 = pat:match 'asdfoo' -- возвратит { _start=4, _end=6}.
local result2 = pat:match 'asdf' -- возвратит nil (образец не найден).
ta-regex/Pegex (оригинальная страница справки)
Pegex is a regular expression (regexp) implementation built on top of LPeg.
The original motivation was to add regular expression search support for the Textadept editor; however the underlying engine is generic.
This module replaces the default text search with one which uses regular expressions.
Currently the full regular expressions are supported (not including eg Perl extensions, though some are planned); this is more than the subset supported natively in Textadept (which eg don't include "|").
Syntax | Matches |
---|---|
. | Any character except newline |
[abA-Z] | The characters a,b, or any capital letter |
\< | Zero-length, matches just before the start of a word |
\> | Zero-length, matches just after the end of a word |
bar | Match foo or bar |
(pat) | Match the same as pat (subgroup) |
(?:pat) | Match the same as pat (subgroup), non-capturing |
x* | Match zero or more x |
x+ | match one or more x |
x? | match zero or one x |
\x | . : match the character x |
\w | Any "word" character [a-zA-Z_] |
\W | Any non-"word" character [^a-zA-Z_] |
\d | Any digit character [0-9] |
\D | Any non-digit character [^0-9] |
\s | Any whitespace character [ \t\n\v\r] |
\S | Any non-whitespace character [^\t\n\v\r] |
\1 … \9 | Back reference to Nth (subgroup) |
Installation
To install Pegex standalone, use "luarocks install pegex". Example usage:
local pegex = require('pegex')
pat = pegex.compile('(?:foo|bar)+')
result = pat:match("asdfoo") -- returns { _start=4, _end=6}
result = pat:match("asdf") -- returns nil (not found)
See the tests for examples using captures and backreferences.
To use with Textadept to replace the default search method:
Add the ta-regex directory to ~/.textadept/modules/
Add the following line to ~/.textadept/init.lua
local ta_regex = require 'ta-regex'
ta_regex.install()
Internal details
The module adds a handler for events.FIND to intercept searches. Regular expressions are converted to equivalent LPEG patterns, which are then used for searching the text.
The regex-to-LPEG conversion can be used independently.
См. также
-- Copyright (C) 2014 Chris Emerson <[email protected]>
-- See LICENSE for details (MIT license).
-- Support for regular expressions (parsed and implemented with LPeg).
local M = {}
local lpeg = lpeg or require('lpeg')
local P = lpeg.P
local R = lpeg.R
local S = lpeg.S
local C = lpeg.C
local V = lpeg.V
local B = lpeg.B
local Carg = lpeg.Carg
local Cb = lpeg.Cb
local Cc = lpeg.Cc
local Cf = lpeg.Cf
local Cp = lpeg.Cp
local Cg = lpeg.Cg
local Ct = lpeg.Ct
local Cmt = lpeg.Cmt
-- We use the algorithm to convert from a regular expression to a Peg
-- expression from:
-- http://www.inf.puc-rio.br/~roberto/docs/ry10-01.pdf
local function sub1(x)
return x - 1
end
-- Parts of a regular expression, returning an LPEG pattern which matches it.
local _start = Cg(Cp(), "_start")
local _end = Cg(Cp()/sub1, "_end")
local mt = {
__index = {
match = function(t, s, index)
local result = t._pat:match(s, index)
if result == nil then return result end
-- Post-process to put the matches into a nicer form
local groups = nil
for k,v in pairs(result) do
if k:sub(1,1) == "s" then
local grpname= k:sub(2)
local endpos = result["e"..grpname]
if v and endpos then
if grpname:match("(%d+)") then
grpname = tonumber(grpname)
end
groups = groups or {}
groups[grpname] = {v,endpos}
result[k] = nil
result["e"..grpname] = nil
end
end
end
result.groups = groups
return result
end,
},
}
-- Make special character sets
local function make_b_s()
return { { " \t\v\n\r", [0]="set" }, [0]="charset" }
end
local function make_b_S()
return { { " \t\v\n\r", [0]="set" }, [0]="charset",
negate=true}
end
local function make_b_w()
return { [0]="charset",
{ [0]="range", "a", "z" },
{ [0]="range", "A", "Z" },
{ [0]="range", "0", "9" },
{ [0]="char", "_" },
}
end
local function make_b_W()
return { [0]="charset",
{ [0]="range", "a", "z" },
{ [0]="range", "A", "Z" },
{ [0]="range", "0", "9" },
{ [0]="char", "_" },
negate=true
}
end
local function make_b_d()
return { [0]="charset",
{ [0]="range", "0", "9" },
}
end
local function make_b_D()
return { [0]="charset",
{ [0]="range", "0", "9" },
negate=true,
}
end
local function make_charset(c)
return function() return { [0]="charset", { [0]="char", c } } end
end
local function make_char(c)
return function() return { [0]="char", c } end
end
local special = S"()\\?*+|.^$"
local any = P"." * Cc({[0] = "."})
-- Perl-style character classes
local b_s = P"\\s" / make_b_s
local b_S = P"\\S" / make_b_S
local b_w = P"\\w" / make_b_w
local b_W = P"\\W" / make_b_W
local b_d = P"\\d" / make_b_d
local b_D = P"\\D" / make_b_D
local b_t = P"\\t" / make_charset('\t')
local b_n = P"\\n" / make_charset('\n')
local b_r = P"\\r" / make_charset('\r')
local b_f = P"\\f" / make_charset('\f')
local b_e = P"\\e" / make_charset('\x1b')
local b_a = P"\\a" / make_charset('\x07')
local backcharset = b_s + b_S + b_w + b_W + b_d + b_D +
b_t + b_n + b_r + b_f + b_e + b_a
local charset_special = S"]-"
local charset_escapes = (b_t + b_n + b_r + b_f + b_e + b_a) /
function(c) return c[1] end
local charset_char = C(P(1) - charset_special) /
function(c) return { [0] = "char", c } end
local range = (C(P(1) - charset_special) * P"-" * C(P(1) - charset_special)) /
function(a,b) return { [0]="range", a, b } end
local charset = (P"[" *
Ct((Cg(P"^"*Cc(true), "negate") + P(0))
* (range + charset_escapes + charset_char)^0) *
P"]") /
function(x) x[0] = "charset" return x end
local char = C(P(1) - special) / function(c) return { [0] = "char", c } end
local escapechar = (P"\\" * C(special)) / function(c) return { [0] = "char", c } end
local backref = (P"\\" * C(R"19")) / function(c) return { tonumber(c), [0] = "backref" } end
local wordchar = R("AZ", "az", "09") + S("_")
local nonwordchar = 1 - wordchar
-- word boundaries
local word_start = P"\\<" * Cc({[0] = "\\<"})
local word_end = P"\\>" * Cc({[0] = "\\>"})
-- {n} etc. Returns two captures - (min, max); max can be nil (no max)
local count_exact = (P"{" * C(R"09" ^ 1) * P"}") / function(c) return tonumber(c), tonumber(c) end
local count_minmax = (P"{" * C(R"09" ^ 1) * P"," * C(R"09" ^ 1) * P"}") / function(min,max) return tonumber(min), tonumber(max) end
local count_min = (P"{" * C(R"09" ^ 1) * P",}") / function(c) return tonumber(c), nil end
local brace_count = count_exact + count_minmax + count_min
-- Grouping
local newgrp = (Cb("groups") * Cp()) /
function(groups, pos)
local grp = #groups+1
groups[grp] = {pos}
groups.open[#groups.open+1] = grp
end
-- endgrp leaves the group number or name as a capture
local endgrp = (Cb("groups") * Cp()) /
function(groups, pos)
local grp = groups.open[#groups.open]
groups.open[#groups.open] = nil
groups[grp][2] = pos
return grp
end
local bra = P"(" * newgrp
local ket = P")" * endgrp
local anonbra = P"(?:"
local anonket = P")"
local pattern = P{
"pattern",
-- A complete pattern, starting from an empty pattern.
pattern = Cg(Carg(1),"groups") * Ct((P"^"*Cg(Cc(1),"anchorstart") + P(0)) * V"subpat" * (P"$"*(-P(1))*Cg(Cc(1),"anchorend") + (-P(1)))) /
function(t) t[0] = "pattern" ; return t end,
-- A set of alternate branches
subpat = (V"branch" * (P"|" * V"branch") ^ 0) /
function(...) return { [0] = "alt", ... } end,
branch = V"concat",
-- A set of concatenated pieces
-- Pass a dummy capture to avoid the special case of no captures confusing
-- the function.
concat = Cc(nil) * (V"piece" ^ 0) /
function(_, ...) return { [0] = "concat", ... } end,
piece = V"atom_multi",
atom_multi = V"atom_plus" + V"atom_star" + V"atom_query" + V"atom_count" + V"atom",
atom_plus = (V"atom" * P"+") /
function(atom) return { [0] = "+", atom } end,
atom_star = (V"atom" * P"*") /
function(atom) return { [0] = "*", atom } end,
atom_query = (V"atom" * P"?") /
function(atom) return { [0] = "?", atom } end,
atom_count = (V"atom" * brace_count) /
function(atom, min, max) return { [0] = "{}", min=min, max=max, atom } end,
anongroup = (anonbra * V"subpat" * anonket),
group = (bra * V"subpat" * ket) /
function(subpat, grpname) return { [0] = "group", subpat, grpname } end,
atom = any + word_start + word_end + escapechar + charset + V"anongroup" + V"group" + char + backref + backcharset,
}
local function foldr(f, t, init)
local res = init
local start = #t
if res == nil then
res = t[start]
start = start - 1
end
for i=start,1,-1 do
res = f(t[i], res)
end
return res
end
local function map(f, t)
local result = {}
for i=1,#t do
result[i] = f(t[i])
end
return result
end
local function add(a,b)
return a+b
end
-- Convert a charset fragment to a PEG
local function charset_to_peg(charfrag)
local t = charfrag[0]
if t == "char" then
assert(#charfrag == 1)
return P(charfrag[1])
elseif t == "range" then
assert(#charfrag == 2)
return R(charfrag[1] .. charfrag[2])
elseif t == "set" then
return S(charfrag[1])
else
error("Got charset bit: "..tostring(t).."/"..tostring(t and t[0]))
end
end
local function re_to_peg(retab, k, patternProps)
local t = retab[0]
if t == "pattern" then
assert(#retab == 1)
local pat = re_to_peg(retab[1], k, patternProps)
-- If the pattern is anchored at the end, make it fail to match
-- if there's another byte. This must be done *before* wrapping
-- with the start/end markers, as once they've matched it's too late
-- to match the next item.
if retab.anchorend then
-- Disallow matching anything afterwards.
pat = pat * (-P(1))
end
-- Add match start/end markers
pat = _start * pat * _end
if not retab.anchorstart then
-- Match the pattern, or a character and try again.
pat = P{pat + 1*V(1)}
end
return pat
elseif t == "group" then
assert(#retab == 2)
-- print(debug.traceback())
patternProps.numGroups = patternProps.numGroups + 1
local grpname = tostring(retab[2])
local newk = Cg(Cp()/sub1, "e"..grpname) * k
local pat = re_to_peg(retab[1], newk, patternProps)
pat = Cg(Cp(), "s"..grpname) * pat
return pat
elseif t == "alt" then
if #retab == 1 then
return re_to_peg(retab[1], k, patternProps)
else
local parts = map(function(x)
return re_to_peg(x, k, patternProps)
end, retab)
return foldr(add, parts)
end
elseif t == "concat" then
return foldr(function(retab_f, k_f) return re_to_peg(retab_f, k_f, patternProps) end, retab, k)
elseif t == "char" then
assert(#retab == 1)
return P(retab[1]) * k
elseif t == "charset" then
local charset_pat = foldr(add, map(charset_to_peg, retab))
if retab.negate then
charset_pat = 1 - charset_pat
end
return charset_pat * k
elseif t == "*" then
return P{"A", A=re_to_peg(retab[1], V"A", patternProps) + k}
elseif t == "+" then
return re_to_peg(retab[1], P{"A", A=re_to_peg(retab[1], V"A", patternProps) + k}, patternProps)
elseif t == "." then
assert(#retab == 0)
return (P(1) - P"\n") * k
elseif t == "?" then
assert(#retab == 1)
return re_to_peg(retab[1], k, patternProps) + k
elseif t == "{}" then
assert(#retab == 1)
-- Rewrite this in terms of ? and *.
-- X{3,} => XXXX*
-- X{3,5} => XXXX?X?
local subpat = retab[1]
local min = retab.min
local max = retab.max
local rewritten = { [0] = "concat" }
for i=1,min do
rewritten[#rewritten+1] = subpat
end
if max == nil then
rewritten[#rewritten+1] = { [0] = "*", subpat }
else
local optional = { [0] = "?", subpat }
for i=min+1,max do
rewritten[#rewritten+1] = optional
end
end
return re_to_peg(rewritten, k, patternProps)
elseif t == "\\<" then
assert(#retab == 0)
return -B(wordchar) * #wordchar * k
elseif t == "\\>" then
assert(#retab == 0)
return B(wordchar) * (-#wordchar) * k
elseif t == "backref" then
local grpname = retab[1]
return Cmt(P(0) * Cb("s"..grpname) * Cb("e"..grpname),
function(subject, pos, s, e)
local backval = subject:sub(s, e)
local here = subject:sub(pos, pos+e-s)
if backval == here then
return pos+e-s+1
else
return false
end
end)
else
error("Not implemented op: " ..tostring(t) .. "/" .. tostring(retab))
end
end
function M.parse(re)
return pattern:match(re, 1, {open={}})
end
function M.compile(re)
-- Since the RE->Peg construction starts backwards (using the
-- continuation), it's more convenient to parse the regular expression
-- backwards.
local retab = M.parse(re)
if retab == nil then
error("Failed to parse regular expression: {"..re.."}", 2)
end
local patternProps = { numGroups = 0 }
local _pat = re_to_peg(retab, P(0), patternProps)
return setmetatable({
_pat = Ct(_pat),
numgroups = patternProps.numGroups
}, mt)
end
-- Increase match complexity
lpeg.setmaxstack(1000)
return M