Jump to content

မေႃႇၵျူး:string utilities: လွင်ႈပႅၵ်ႇပိူင်ႈ ၼႂ်းၵႄႈ လွင်ႈၶူၼ်ႉၶႆႈ

လုၵ်ႉတီႈ ဝိၵ်ႇသျိၼ်ႇၼရီႇ မႃး
Content deleted Content added
No edit summary
No edit summary
Tag: Reverted
ထႅဝ် 1: ထႅဝ် 1:
local mw = mw
local string = string
local table = table
local ustring = mw.ustring

local byte = string.byte
local char = string.char
local concat = table.concat
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local len = string.len
local load_data = mw.loadData
local lower = string.lower
local match = string.match
local next = next
local reverse = string.reverse
local select = select
local sort = table.sort
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ucodepoint = ustring.codepoint
local ufind = ustring.find
local ugcodepoint = ustring.gcodepoint
local ugmatch = ustring.gmatch
local ugsub = ustring.gsub
local ulower = ustring.lower
local umatch = ustring.match
local unpack = unpack
local upper = string.upper
local usub = ustring.sub
local uupper = ustring.upper
-- Defined below.
local charset_escape
local codepoint
local explode_utf8
local format_fun
local get_indefinite_article
local pattern_escape
local pattern_simplifier
local php_trim
local replacement_escape
local u
local ulen

local module_name = "string_utilities"
local module_name = "string_utilities"

local export = {}
local export = {}


--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
local format_escapes = {
function export.explode_utf8(str)
["op"] = "{",
["cl"] = "}",
local text, i = {}, 0
for ch in gmatch(str, ".[\128-\191]*") do
}
i = i + 1
text[i] = ch
end
return text
end
explode_utf8 = export.explode_utf8


--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
function export.format_fun(str, fun)
function export.pattern_escape(str)
return (str:gsub("{(\\?)((\\?)[^{}]*)}", function (p1, name, p2)
return (gsub(str, "[$%%()*+%-.?[%]^]", "%%%0"))
if #p1 + #p2 == 1 then
return format_escapes[name] or error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
else
if fun(name) and type(fun(name)) ~= "string" then
error(module_name .. ".format: '" .. name .. "' is a " .. type(fun(name)) .. ", not a string")
end
return fun(name) or error(module_name .. ".format: '" .. name .. "' not found in table")
end
end))
end
end
pattern_escape = export.pattern_escape


--[==[Escapes only the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>.]==]
--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
function export.charset_escape(str)
====Examples====
return (gsub(str, "[%%%-%]^]", "%%%0"))
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
return export.format_fun(str, function (key) return tbl[key] end)
end
end
charset_escape = export.charset_escape


--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
--[==[Explodes a string into an array of UTF8 characters. '''Warning''': this function has no safety checks for non-UTF8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
function export.explode_utf8(str)
function export.replacement_escape(str)
return (gsub(str, "%%", "%%%%"))
local byte = string.byte
end
local sub = string.sub
replacement_escape = export.replacement_escape

local str_len = #str
do
local text = {}
local n, i, b = 1, 0
local function check_sets_equal(set1, set2)
local k2
for k1, v1 in next, set1 do
while n <= str_len do
local v2 = set2[k1]
b = byte(str, n)
if v1 ~= v2 and (v2 == nil or not check_sets_equal(v1, v2)) then
i = i + 1
return false
if b < 0xC0 then
end
text[i] = sub(str, n, n)
n = n + 1
k2 = next(set2, k2)
elseif b < 0xE0 then
text[i] = sub(str, n, n + 1)
n = n + 2
elseif b < 0xF0 then
text[i] = sub(str, n, n + 2)
n = n + 3
else
text[i] = sub(str, n, n + 3)
n = n + 4
end
end
return next(set2, k2) == nil
end
end
local function check_sets(bytes)
return text
local key, set1, set = next(bytes)
end
if set1 == true then

return true
-- A helper function which takes a string, position and type ("byte" or "char"), and returns the equivalent position for the other type (e.g. iterate_utf8("字典", 2, "char") returns 4, because character 2 of "字典" begins with byte 4). `pos` can be positive or negative, and the function will iterate over the string forwards or backwards (respectively) until it reaches the input position. Checks byte-by-byte; skipping over trailing bytes, and then calculating the correct byte trail for any leading bytes (i.e. how many trailing bytes should follow); these trailing bytes are then checked together.
elseif not check_sets(set1) then
-- The optional parameters `init_from_type` and `init_to_type` can be used to start part-way through an iteration to improve performance, if multiple values need to be returned from the same string. For example, iterate_utf8("слова́рь", 11, "byte", 5, 3) will begin checking at byte 5/the start of character 3. Note: The function won't check if these values match each other (as the only way to do this would be to run the iteration from the beginning), so mismatched values will return incorrect results.
return false
local function iterate_utf8(text, pos, from_type, init_from_type, init_to_type)
end
-- Position 0 is always valid and never changes.
while true do
if pos == 0 then
key, set = next(bytes, key)
return pos
if not key then
return true
elseif not check_sets_equal(set, set1) then
return false
end
end
end
end
local function make_charset(range)
local to_type
if from_type == "char" then
if #range == 1 then
return char(range[1])
to_type = "byte"
end
else
sort(range)
to_type = "char"
local compressed, n, start = {}, 0, range[1]
for i = 1, #range do
local this, nxt = range[i], range[i + 1]
if nxt ~= this + 1 then
n = n + 1
compressed[n] = this == start and char(this) or
char(start) .. "-" .. char(this)
start = nxt
end
end
return "[" .. concat(compressed) .. "]"
end
end
local function parse_1_byte_charset(pattern, pos)
-- Positive positions iterate forwards; negative positions iterate backwards.
while true do
local iterate_val
local ch, nxt_pos
if pos > 0 then
pos, ch, nxt_pos = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos)
iterate_val = 1
if not ch then
else
return false
iterate_val = -1
elseif ch == "%" then
if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", nxt_pos) then
return false
end
pos = pos + 2
elseif ch == "]" then
pos = nxt_pos
return pos
else
return false
end
end
end
end
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]
-- Adjust init_from_type and init_to_type to the iteration before, so that matches for the position given by them will work.
pattern_simplifier = require("Module:fun").memoize(function(pattern)
local trail, cp, min, b = 0
if type(pattern) == "number" then
local c, leading_byte = {}
return tostring(pattern)
c[from_type] = init_from_type and init_from_type ~= 0 and init_from_type - iterate_val or 0
c[to_type] = init_to_type and init_to_type ~= 0 and init_to_type - iterate_val or 0
while true do
if pos > 0 then
b = text:byte(c.byte + 1)
else
b = text:byte(text:len() + c.byte)
end
end
local pos, captures, start, n, output = 1, 0, 1, 0
-- Position byte doesn't exist, so iterate the return value and return it.
if not b then
while true do
local ch, nxt_pos
return c[to_type] + iterate_val
pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos)
elseif b < 0x80 then
if not ch then
-- 1-byte codepoint, 00-7F.
trail = 0
break
cp = b
min = 0
leading_byte = true
elseif b < 0xc0 then
-- A trailing byte.
leading_byte = false
elseif b < 0xc2 then
-- An overlong encoding for a 1-byte codepoint.
error("String " .. text .. " is not UTF-8.")
elseif b < 0xe0 then
-- 2-byte codepoint, C2-DF.
trail = 1
cp = b - 0xc0
min = 0x80
leading_byte = true
elseif b < 0xf0 then
-- 3-byte codepoint, E0-EF.
trail = 2
cp = b - 0xe0
min = 0x800
leading_byte = true
elseif b < 0xf4 then
-- 4-byte codepoint, F0-F3.
trail = 3
cp = b - 0xf0
min = 0x10000
leading_byte = true
elseif b == 0xf4 then
-- 4-byte codepoint, F4.
-- Make sure it doesn't decode to over U+10FFFF.
if text:byte(c.byte + 2) > 0x8f then
error("String " .. text .. " is not UTF-8.")
end
end
local nxt = sub(pattern, nxt_pos, nxt_pos)
trail = 3
cp = 4
if ch == "%" then
min = 0x100000
if nxt == "b" then
if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then
leading_byte = true
return false
else
end
-- Codepoint over U+10FFFF, or invalid byte.
pos = pos + 4
error("String " .. text .. " is not UTF-8.")
elseif nxt == "f" then
end
pos = pos + 2
if not match(pattern, "^()%[[^^]", pos) then
-- Check subsequent bytes for multibyte codepoints.
return false
if leading_byte then
end
local from, to
-- Only possible to convert a %f charset which is all
if pos > 0 then
-- ASCII, so use parse_1_byte_charset.
from, to = c.byte + 2, c.byte + 1 + trail
pos = parse_1_byte_charset(pattern, pos)
if not pos then
return false
end
elseif nxt == "Z" then
pos = pos + 2
nxt = sub(pattern, pos, pos)
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 1
else
output = output or {}
n = n + 1
if nxt == "?" then
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*"
pos = pos + 1
else
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*"
end
start = pos
end
elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then
return false
-- Skip the next character if it's ASCII. Otherwise, we will
-- still need to do length checks.
else
pos = pos + (byte(nxt) < 128 and 2 or 1)
end
elseif ch == "(" then
if nxt == ")" or captures == 32 then
return false
end
captures = captures + 1
pos = pos + 1
elseif ch == "." then
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 2
else
output = output or {}
n = n + 1
if nxt == "?" then
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*"
pos = pos + 2
else
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*"
pos = pos + 1
end
start = pos
end
elseif ch == "[" then
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
if nxt == "^" then
return false
-- If the first character is "%", ch_len is determined by the
-- next one instead.
elseif nxt == "%" then
nxt_pos = nxt_pos + 1
nxt = sub(pattern, nxt_pos, nxt_pos)
end
local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos)
if ch_len == 1 then -- Single-byte charset.
pos = parse_1_byte_charset(pattern, pos + 1)
if not pos then
return false
end
else -- Multibyte charset.
local charset_pos, bytes = pos
pos = pos + 1
while true do -- TODO: non-ASCII charset ranges.
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
if not ch then
return false
-- If escaped, get the next character. No need to
-- distinguish magic characters or character classes,
-- as they'll all fail for having the wrong length
-- anyway.
elseif ch == "%" then
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
elseif ch == "]" then
pos = nxt_pos
break
end
if ch_len ~= #ch then
return false
end
bytes = bytes or {}
local bytes = bytes
for i = 1, ch_len - 1 do
local b = byte(ch, i, i)
bytes[b] = bytes[b] or {}
bytes = bytes[b]
end
bytes[byte(ch, -1)] = true
pos = nxt_pos
end
if not pos then
return false
end
local nxt = sub(pattern, pos, pos)
if (
(nxt == "?" or nxt == "*" or nxt == "-") or
(nxt == "+" and ch_len > 2) or
not check_sets(bytes)
) then
return false
end
local ranges, b, key, next_byte = {}, 0
repeat
key, next_byte = next(bytes)
local range, n = {key}, 1
-- Loop starts on the second iteration.
for key in next, bytes, key do
n = n + 1
range[n] = key
end
b = b + 1
ranges[b] = range
bytes = next_byte
until next_byte == true
if nxt == "+" then
local range1, range2 = ranges[1], ranges[2]
ranges[1] = make_charset(range1)
ranges[3] = make_charset(range2)
local n = #range2
for i = 1, #range1 do
n = n + 1
range2[n] = range1[i]
end
ranges[2] = make_charset(range2) .. "*"
pos = pos + 1
else
for i = 1, #ranges do
ranges[i] = make_charset(ranges[i])
end
end
output = output or {}
n = n + 1
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)
start = pos
end
elseif nxt == "+" then
if #ch ~= 2 then
return false
end
output = output or {}
n = n + 1
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2)
pos = nxt_pos + 1
start = pos
elseif nxt == "?" or nxt == "*" or nxt == "-" then
return false
else
else
pos = nxt_pos
from, to = text:len() + c.byte + 1, text:len() + c.byte + trail
end
end
end
for trailing_byte = from, to do
if start == 1 then
b = text:byte(trailing_byte)
return pattern
if not b or b < 0x80 or b > 0xbf then
end
error("String " .. text .. " is not UTF-8.")
return concat(output) .. sub(pattern, start)
end
end, true)
cp = cp * 0x40 + b - 0x80
export.pattern_simplifier = pattern_simplifier -- For testing.
end
end
local next_byte = text:byte(to + 1)

if next_byte and next_byte >= 0x80 and next_byte <= 0xbf then
function export.len(str)
-- Too many trailing bytes.
return type(str) == "number" and len(str) or
error("String " .. text .. " is not UTF-8.")
#str - #gsub(str, "[^\128-\191]+", "")
elseif cp < min then
end
-- Overlong encoding.
ulen = export.len
error("String " .. text .. " is not UTF-8.")

function export.sub(str, i, j)
str, i = type(str) == "number" and tostring(str) or str, i or 1
if i < 0 or j and j < 0 then
return usub(str, i, j)
elseif j and i > j or i > #str then
return ""
end
local n, new_i = 0
for loc1, loc2 in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do
n = n + loc2 - loc1
if not new_i and n >= i then
new_i = loc2 - (n - i) - 1
if not j then
return sub(str, new_i)
end
end
end
end
if j and n > j then
return sub(str, new_i, loc2 - (n - j) - 1)
c.byte = c.byte + iterate_val
if leading_byte then
c.char = c.char + iterate_val
end
end
end
return new_i and sub(str, new_i) or ""
if c[from_type] == pos then
end
return c[to_type]

do
local function _find(str, loc1, loc2, ...)
if loc1 and not match(str, "^()[^\128-\255]*$") then
-- Use raw values of loc1 and loc2 to get loc1 and the length of the match.
loc1, loc2 = ulen(sub(str, 1, loc1)), ulen(sub(str, loc1, loc2))
-- Offset length with loc1 to get loc2.
loc2 = loc1 + loc2 - 1
end
end
return loc1, loc2, ...
end
--[==[Locates `pattern` in `str`, preferring string.find and resorting to mw.ustring.find only when the string library cannot be used. `init` defaults to 1; if `plain` is set, `pattern` is searched for literally.]==]
function export.find(str, pattern, init, plain)
	init = init or 1
	-- A starting position other than 1 in a string that contains bytes above
	-- 127 is handed straight to the ustring library.
	if init ~= 1 and not match(str, "^()[^\128-\255]*$") then
		return ufind(str, pattern, init, plain)
	end
	local needle = pattern
	if not plain then
		-- Ask for a pattern the string library can run; if none is available,
		-- only the ustring library will do.
		needle = pattern_simplifier(pattern)
		if not needle then
			return ufind(str, pattern, init)
		end
	end
	-- Results from the string library are passed through _find before being
	-- returned to the caller.
	return _find(str, find(str, needle, init, plain and true or false))
end
end
end
end


--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==]
--[==[Converts a character position to the equivalent byte position.]==]
function export.charsToBytes(text, pos)
function export.match(str, pattern, init)
init = init or 1
return iterate_utf8(text, pos, "char")
if init ~= 1 and not match(str, "^()[^\128-\255]*$") then
return umatch(str, pattern, init)
end
local simple = pattern_simplifier(pattern)
if simple then
return match(str, simple, init)
end
return umatch(str, pattern, init)
end
end


--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==]
--[==[Converts a byte position to the equivalent character position.]==]
function export.bytesToChars(text, pos)
function export.gmatch(str, pattern)
local byte = text:byte(pos)
local simple = pattern_simplifier(pattern)
if simple then
if byte and byte >= 0x80 and byte <= 0xbf then
return gmatch(str, simple)
error("Byte " .. pos .. " is not a leading byte.")
end
end
return iterate_utf8(text, pos, "byte")
return ugmatch(str, pattern)
end
end


--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==]
-- A helper function which iterates through a pattern, and returns two values: a potentially modified version of the pattern, and a boolean indicating whether the returned pattern is simple (i.e. whether it can be used with the stock string library); if not, then the pattern is complex (i.e. it must be used with the ustring library, which is much more resource-intensive).
local function patternSimplifier(text, pattern, plain)
function export.gsub(str, pattern, repl, n)
pattern = tostring(pattern)
local simple = pattern_simplifier(pattern)
if simple then
-- If `plain` is set, then the pattern is treated as literal (so is always simple). Only used by find.
return gsub(str, simple, repl, n)
if plain then
return pattern, true
--If none of these are present, then the pattern has to be simple.
elseif not (
pattern:match("%[.-[\128-\255].-%]") or
pattern:match("[\128-\255][%*%+%?%-]") or
pattern:match("%%[abcdlpsuwxACDLPSUWXZ]") or
pattern:match("%[%^[^%]]+%]") or
pattern:match("%.[^%*%+%-]") or
pattern:match("%.$") or
pattern:match("%%b.?[\128-\255]") or
pattern:match("()", 1, true)
) then
return pattern, true
end
end
-- Otherwise, the pattern could go either way.
return ugsub(str, pattern, repl, n)
end
-- Build up the new pattern in a table, then concatenate at the end. We do it this way, as occasionally entries get modified along the way.

local new_pattern = {}
--[==[Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.]==]
local len, pos, b = pattern:len(), 0
function export.plain_gsub(str, pattern, repl, n)
local char, next_char
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n)
end
-- `escape` and `balanced` are counters, which ensure the effects of % or %b (respectively) are distributed over the following bytes.

-- `set` is a boolean that states whether the current byte is in a charset.
--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]
-- `capture` keeps track of how many layers of capture groups the position is in, while `captures` keeps a tally of how many groups have been detected (due to the string library limit of 32).
function export.reverse(str)
local escape, set, balanced, capture, captures = 0, false, 0, 0, 0
return reverse(gsub(str, "[\194-\244][\128-\191]*", reverse))
end
while pos < len do

pos = pos + 1
do
b = pattern:byte(pos)
local function err(cp)
if escape > 0 then escape = escape - 1 end
error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2)
if balanced > 0 then balanced = balanced - 1 end
end
char = next_char or pattern:sub(pos, pos)

next_char = pattern:sub(pos + 1, pos + 1)
local function utf8_char(cp)
if escape == 0 then
cp = tonumber(cp)
if char == "%" then
-- Apply % escape.
if cp < 0 then
err("-0x" .. format("%X", -cp + 1))
if next_char == "." or next_char == "%" or next_char == "[" or next_char == "]" then
elseif cp < 0x80 then
escape = 2
return char(cp)
if balanced > 0 then balanced = balanced + 1 end
elseif cp < 0x800 then
-- These charsets make the pattern complex.
return char(
elseif next_char:match("[acdlpsuwxACDLPSUWXZ]") then
0xC0 + cp / 0x40,
return pattern, false
-- This is "%b".
0x80 + cp % 0x40
)
elseif next_char == "b" then
elseif cp < 0x10000 then
balanced = 4
if cp >= 0xD800 and cp < 0xE000 then
end
return "?" -- mw.ustring.char returns "?" for surrogates.
-- Enter or leave a charset.
elseif char == "[" then
set = true
elseif char == "]" then
set = false
elseif char == "(" then
capture = capture + 1
elseif char == ")" then
if capture > 0 and set == false and balanced == 0 then
captures = captures + 1
capture = capture - 1
end
end
end
return char(
0xE0 + cp / 0x1000,
0x80 + cp / 0x40 % 0x40,
0x80 + cp % 0x40
)
elseif cp < 0x110000 then
return char(
0xF0 + cp / 0x40000,
0x80 + cp / 0x1000 % 0x40,
0x80 + cp / 0x40 % 0x40,
0x80 + cp % 0x40
)
end
end
err("0x" .. format("%X", cp))
end
-- Multibyte char.

if b > 0x7f then
function export.char(cp, ...)
-- If followed by "*", "+" or "-", then 2-byte chars can be converted into charsets. However, this is not possible with 3 or 4-byte chars, as the charset would be too permissive, because if the trailing bytes were in a different order then this could be a different valid character.
if next_char == "*" or next_char == "+" or next_char == "-" then
if ... == nil then
return utf8_char(cp)
local prev_pos = pattern:byte(pos - 1)
if prev_pos > 0xc1 and prev_pos < 0xe0 then
new_pattern[#new_pattern] = "[" .. new_pattern[#new_pattern]
table.insert(new_pattern, char .. "]")
else
return pattern, false
end
-- If in a charset or used in "%b", then the pattern is complex.
-- If followed by "?", add "?" after each byte.
elseif next_char == "?" then
table.insert(new_pattern, char .. "?")
local check_pos, check_b, i = pos, pattern:byte(pos), #new_pattern
while check_b and check_b < 0xc0 do
check_pos = check_pos - 1
check_b = pattern:byte(check_pos)
i = i - 1
new_pattern[i] = new_pattern[i] .. "?"
end
pos = pos + 1
next_char = pattern:sub(pos + 1, pos + 1)
elseif set or balanced > 0 then
return pattern, false
else
table.insert(new_pattern, char)
end
elseif char == "." then
-- "*", "+", "-" are always okay after ".", as they don't care how many bytes a char has.
if set or next_char == "*" or next_char == "+" or next_char == "-" or escape > 0 then
table.insert(new_pattern, char)
-- If followed by "?", make sure "?" is after the leading byte of the UTF-8 char pattern, then skip forward one.
elseif next_char == "?" then
table.insert(new_pattern, "[%z\1-\127\194-\244]?[\128-\191]*")
pos = pos + 1
next_char = pattern:sub(pos + 1, pos + 1)
-- If used with "%b", pattern is complex.
elseif balanced > 0 then
return pattern, false
-- Otherwise, add the UTF-8 char pattern.
else
table.insert(new_pattern, "[%z\1-\127\194-\244][\128-\191]*")
end
-- Negative charsets are always complex, unless the text has no UTF-8 chars.
elseif char == "[" and next_char == "^" and escape == 0 and text:match("[\128-\255]") then
return pattern, false
-- "()" matches the position unless escaped or used with "%b", so always necessitates ustring (as we need it to match the char position, not the byte one).
elseif char == "(" and next_char == ")" and balanced == 0 and escape == 0 and text:match("[\128-\255]") then
return pattern, false
else
table.insert(new_pattern, char)
end
end
local ret = {cp, ...}
for i = 1, select("#", cp, ...) do
ret[i] = utf8_char(ret[i])
end
return concat(ret)
end
end
u = export.char
if captures > 32 then
return pattern, false
else
pattern = table.concat(new_pattern)
return pattern, true
end
end
end


do
--[==[A version of len which uses string.len, but returns the same result as mw.ustring.len.]==]
function export.len(text)
local function get_codepoint(b1, b2, b3, b4)
if b1 < 128 then
text = tostring(text)
return b1, 1
local len_bytes = text:len()
elseif b1 < 224 then
if not text:match("[\128-\255]") then
return len_bytes
return 0x40 * b1 + b2 - 0x3080, 2
elseif b1 < 240 then
else
return iterate_utf8(text, len_bytes, "byte")
return 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080, 3
end
return 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080, 4
end
end
end


function export.codepoint(str, i, j)
--[==[A version of sub which uses string.sub, but returns the same result as mw.ustring.sub.]==]
if type(str) == "number" then
function export.sub(text, i_char, j_char)
return byte(str, i, j)
text = tostring(text)
end
if not text:match("[\128-\255]") then
i, j = i or 1, j == -1 and #str or i or 1
return text:sub(i_char, j_char)
if i == 1 and j == 1 then
return (get_codepoint(byte(str, 1, 4)))
elseif i < 0 or j < 0 then
return ucodepoint(str, i, j) -- FIXME
end
local n, nb, ret, nr = 0, 1, {}, 0
while n < j do
n = n + 1
if n < i then
local b = byte(str, nb)
nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
else
local b1, b2, b3, b4 = byte(str, nb, nb + 3)
if not b1 then
break
end
nr = nr + 1
local add
ret[nr], add = get_codepoint(b1, b2, b3, b4)
nb = nb + add
end
end
return unpack(ret)
end
end
codepoint = export.codepoint
local i_byte, j_byte
if j_char then
function export.gcodepoint(str, i, j)
if i_char > 0 and j_char > 0 then
i, j = i or 1, j ~= -1 and j or nil
if j_char < i_char then return "" end
if i < 0 or j and j < 0 then
i_byte = iterate_utf8(text, i_char, "char")
return ugcodepoint(str, i, j) -- FIXME
j_byte = iterate_utf8(text, j_char + 1, "char", i_char, i_byte) - 1
end
elseif i_char < 0 and j_char < 0 then
local n, nb = 1, 1
if j_char < i_char then return "" end
while n < i do
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
local b = byte(str, nb)
i_byte = iterate_utf8(text, i_char, "char", j_char, j_byte)
if not b then
-- For some reason, mw.ustring.sub with i=0, j=0 returns the same result as for i=1, j=1, while string.sub always returns "". However, mw.ustring.sub does return "" with i=1, j=0. As such, we need to adjust j_char to 1 if i_char is either 0, or negative with a magnitude greater than the length of the string.
break
elseif j_char == 0 then
end
i_byte = iterate_utf8(text, i_char, "char")
nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
if i_byte == 0 or -i_byte > text:len() then j_char = 1 end
n = n + 1
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
else
end
i_byte = iterate_utf8(text, i_char, "char")
return function()
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
if j and n > j then
return nil
end
n = n + 1
local b1, b2, b3, b4 = byte(str, nb, nb + 3)
if not b1 then
return nil
end
local ret, add = get_codepoint(b1, b2, b3, b4)
nb = nb + add
return ret
end
end
else
i_byte = iterate_utf8(text, i_char, "char")
end
end
return text:sub(i_byte, j_byte)
end
end


--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
function export.lower(text)
function export.lower(str)
return (match(str, "^()[^\128-\255]*$") and lower or ulower)(str)
text = tostring(text)
if not text:match("[\128-\255]") then
return text:lower()
else
return mw.ustring.lower(text)
end
end
end


--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
function export.upper(text)
function export.upper(str)
return (match(str, "^()[^\128-\255]*$") and upper or uupper)(str)
text = tostring(text)
if not text:match("[\128-\255]") then
return text:upper()
else
return mw.ustring.upper(text)
end
end
end


do
--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==]
function export.find(text, pattern, init_char, plain)
local function add_captures(text, n, ...)
-- Insert any captures from the splitting pattern.
text = tostring(text)
local offset, capture = n - 1, ...
local simple
while capture do
pattern, simple = patternSimplifier(text, pattern, plain)
n = n + 1
-- If the pattern is simple but multibyte characters are present, then init_char needs to be converted into bytes for string.find to work properly, and the return values need to be converted back into chars.
text[n] = capture
if simple then
capture = select(n - offset, ...)
if not text:match("[\128-\255]") then
end
return text:find(pattern, init_char, plain)
return n
end
local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...)
if not (loc1 and start <= str_len) then
-- If no match, or there is but we're past the end of the string
-- (which happens when the match is the empty string), then add
-- the final chunk and return.
n = n + 1
text[n] = _sub(str, start)
return
elseif loc2 < loc1 then
-- Special case: If we match the empty string, then include the
-- next character; this avoids an infinite loop, and makes
-- splitting by an empty string work the way mw.text.split() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, return immediately, so we
-- don't get a final empty string. If using the string library, we
-- need to make sure we advance by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
n = n + 1
text[n] = _sub(str, start, loc1)
start = loc1 + 1
if start > str_len then
return ... and add_captures(text, n, ...) or n
end
else
else
-- Add chunk up to the current match.
local init_byte = init_char and iterate_utf8(text, init_char, "char")
n = n + 1
local byte1, byte2, c1, c2, c3, c4, c5, c6, c7, c8, c9 = text:find(pattern, init_byte, plain)
text[n] = _sub(str, start, loc1 - 1)
start = loc2 + 1
-- If string.find returned nil, then return nil.
end
if not (byte1 and byte2) then
return (... and add_captures(text, n, ...) or n), start
return nil
end
-- Drives the split: repeatedly runs `_find` from the current start position
-- and hands the result to `iterate`, which appends to the output array and
-- returns the updated element count plus the next start position. The loop
-- ends once `iterate` no longer returns a start position.
local function _split(str, pattern, str_len, _sub, _find, plain)
	local chunks, count, pos = {}, 0, 1
	while pos do
		count, pos = iterate(str, str_len, chunks, count, pos, _sub, _find(str, pattern, pos, plain))
	end
	return chunks
end
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.]==]
function export.split(str, pattern, str_lib, plain)
	if not (str_lib or plain) then
		-- Neither shortcut was requested: see whether the pattern can be
		-- rewritten into a form the byte-based string library can handle.
		local simplified = pattern_simplifier(pattern)
		if simplified then
			return _split(str, simplified, #str, sub, find)
		end
		-- No simplification available, so fall back to the ustring library,
		-- measuring the string with ulen to match.
		return _split(str, pattern, ulen(str), usub, ufind)
	end
	-- Caller forced the string library, or asked for a literal match.
	return _split(str, pattern, #str, sub, find, plain)
end
export.capturing_split = export.split -- To be removed.
end

do
-- TODO: merge this with export.split. Not clear how to do this while
-- maintaining the same level of performance, as gsplit is slower.
local function _split(str, pattern, str_len, _sub, _find, plain)
local start, final = 1
local function iter(loc1, loc2, ...)
-- If no match, return the final chunk.
if not loc1 then
final = true
return _sub(str, start)
end
end
-- Special case: If we match the empty string, then eat the
-- next character; this avoids an infinite loop, and makes
-- Get first return value. If we have a positive init_char, we can save resources by resuming at that point.
-- splitting by the empty string work the way mw.text.gsplit() does
local char1, char2
-- (including non-adjacent empty string matches with %f). If we
if (not init_char) or init_char > 0 then
-- reach the end of the string this way, set `final` to true, so we
char1 = iterate_utf8(text, byte1, "byte", init_byte, init_char)
-- don't get stuck matching the empty string at the end.
local chunk
if loc2 < loc1 then
-- If using the string library, we need to make sure we advance
-- by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
chunk = _sub(str, start, loc1)
if loc1 >= str_len then
final = true
else
start = loc1 + 1
end
-- Eat chunk up to the current match.
else
else
char1 = iterate_utf8(text, byte1, "byte")
chunk = _sub(str, start, loc1 - 1)
start = loc2 + 1
end
end
return chunk, ...
end
-- If byte1 and byte2 are the same, don't bother running iterate_utf8 twice. Otherwise, resume iterate_utf8 from byte1 to find char2.
if byte1 == byte2 then
return function()
char2 = char1
if not final then
else
char2 = iterate_utf8(text, byte2, "byte", byte1, char1)
return iter(_find(str, pattern, start, plain))
end
end
return nil
return unpack{char1, char2, c1, c2, c3, c4, c5, c6, c7, c8, c9}
end
end
end
else
return mw.ustring.find(text, pattern, init_char, plain)
-- Counterpart of export.split that delegates to the `_split` helper of this
-- `do` block instead of building the result array itself, and returns
-- whatever that helper returns.
-- NOTE(review): the helper appears to return an iterator function yielding
-- one chunk (plus any captures) per call -- confirm against the full module.
-- `str_lib` forces use of the string library; `plain` is passed through to
-- the find function as its "plain" flag.
function export.gsplit(str, pattern, str_lib, plain)
	if not (str_lib or plain) then
		-- Try to rewrite the pattern for the byte-based string library first.
		local simplified = pattern_simplifier(pattern)
		if simplified then
			return _split(str, simplified, #str, sub, find)
		end
		-- Otherwise use the ustring library, with a matching ulen length.
		return _split(str, pattern, ulen(str), usub, ufind)
	end
	return _split(str, pattern, #str, sub, find, plain)
end
end
end
end


function export.trim(str, charset)
--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==]
if not charset then
function export.match(text, pattern, init)
return match(str, "^()%s*$") and "" or match(str, "^%s*(.*%S)")
text = tostring(text)
elseif match(charset, "^()[^\128-\255]*$") then
local simple
return match(str, "^()[" .. charset .. "]*$") and "" or match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])")
pattern, simple = patternSimplifier(text, pattern)
if simple then
if init and text:find("[\128-\255]") then
init = iterate_utf8(text, init, "char")
end
return text:match(pattern, init)
else
return mw.ustring.match(text, pattern, init)
end
end
return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$")
end
end


do
--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==]
local entities
function export.gmatch(text, pattern)

text = tostring(text)
local function decode_numeric_entity(code, pattern, base)
local simple
local cp = match(code, pattern) and tonumber(code, base)
pattern, simple = patternSimplifier(text, pattern)
return cp and cp < 0x110000 and u(cp) or nil
if simple then
end
return text:gmatch(pattern)

else
local function decode_entity(hash, x, code)
return mw.ustring.gmatch(text, pattern)
if hash == "#" then
return x == "" and decode_numeric_entity(code, "^%d+$") or
decode_numeric_entity(code, "^%x+$", 16)
end
entities = entities or load_data("Module:data/entities")
return entities[x .. code]
end

-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
function export.decode_entities(str)
	-- Plain-text search for "&" first: without one there is nothing that the
	-- entity pattern could match, so the input is returned untouched.
	if not find(str, "&", 1, true) then
		return str
	end
	-- Captures: optional "#", optional "x"/"X", then the entity body. Each
	-- match is handed to decode_entity; the parentheses drop gsub's
	-- replacement count so only the string is returned.
	return (gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity))
end
end
end
end


do
--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==]
local html_entities
function export.gsub(text, pattern, repl, n)
text = tostring(text)
local simple
local function encode_entity(ch)
local entity = html_entities[ch]
pattern, simple = patternSimplifier(text, pattern)
if simple then
if entity then
return text:gsub(pattern, repl, n)
return entity
end
else
entity = "&#" .. codepoint(ch) .. ";"
return mw.ustring.gsub(text, pattern, repl, n)
html_entities[ch] = entity
return entity
end
-- Replaces characters of `str` with HTML entities.
--   * No `charset`: only the default set (", &, ', <, > and the two-byte
--     sequence "\194\160") is replaced, by direct lookup in the entity table.
--   * `plain`: `charset` is taken literally (escaped with charset_escape).
--   * `str_lib`: `charset` is used as-is with the string library; it must
--     not contain bytes above \127, otherwise an error is raised.
--   * Otherwise the string library is used if pattern_simplifier can
--     rewrite the character set, and the ustring library if it cannot.
function export.encode_entities(str, charset, str_lib, plain)
	-- Memoized HTML entities (taken from mw.text.lua), created on first use
	-- and kept in the `html_entities` upvalue.
	if not html_entities then
		html_entities = {
			["\""] = "&quot;",
			["&"] = "&amp;",
			["'"] = "&#039;",
			["<"] = "&lt;",
			[">"] = "&gt;",
			["\194\160"] = "&nbsp;",
		}
	end
	if not charset then
		-- Table replacement: matches with no entry in the table are left
		-- unchanged by gsub.
		return (gsub(str, "[\"&'<>\194]\160?", html_entities))
	end
	if plain then
		return (gsub(str, "[" .. charset_escape(charset) .. "]", encode_entity))
	end
	if str_lib then
		if not match(charset, "^()[^\128-\255]*$") then
			error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.")
		end
		return (gsub(str, "[" .. charset .. "]", encode_entity))
	end
	local set_pattern = "[" .. charset .. "]"
	local byte_pattern = pattern_simplifier(set_pattern)
	if byte_pattern then
		return (gsub(str, byte_pattern, encode_entity))
	end
	return (ugsub(str, set_pattern, encode_entity))
end
end
end
end


do
--[==[
local function decode_path(code)
-- Reimplementation of mw.ustring.split() that includes any capturing
return char(tonumber(code, 16))
-- groups in the splitting pattern. This works like Python's re.split()
end
-- function, except that it has Lua's behavior when the split pattern
-- is empty (i.e. advancing by one character at a time; Python returns the
local function decode(lead, trail)
-- whole remainder of the string).
if lead == "+" or lead == "_" then
]==]
return " " .. trail
function export.capturing_split(str, pattern)
elseif #trail == 2 then
local ret = {}
return decode_path(trail)
-- (.-) corresponds to (.*?) in Python or Perl; () captures the
-- current position after matching.
pattern = "(.-)" .. pattern .. "()"
local start = 1
while true do
-- Did we reach the end of the string?
if start > #str then
table.insert(ret, "")
return ret
end
end
return lead .. trail
-- match() returns all captures as multiple return values;
end
-- we need to insert into a table to get them all.
local captures = {export.match(str, pattern, start)}
function export.decode_uri(str, enctype)
-- If no match, add the remainder of the string.
enctype = enctype and upper(enctype) or "QUERY"
if #captures == 0 then
if enctype == "PATH" then
table.insert(ret, export.sub(str, start))
return ret
return find(str, "%", 1, true) and
gsub(str, "%%(%x%x)", decode_path) or str
elseif enctype == "QUERY" then
return (find(str, "%", 1, true) or find(str, "+", 1, true)) and
gsub(str, "([%%%+])(%x?%x?)", decode) or str
elseif enctype == "WIKI" then
return (find(str, "%", 1, true) or find(str, "_", 1, true)) and
gsub(str, "([%%_])(%x?%x?)", decode) or str
end
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
end
end

do
local function _remove_comments(str, pre)
local head = find(str, "<!--", 1, true)
if not head then
return str
end
end
local newstart = table.remove(captures)
local ret, n = {sub(str, 1, head - 1)}, 1
while true do
-- Special case: If we don't advance by any characters, then advance
local loc = find(str, "-->", head + 4, true)
-- by one character; this avoids an infinite loop, and makes splitting
if not loc then
-- by an empty string work the way mw.ustring.split() does. If we
return pre and concat(ret) or
-- reach the end of the string this way, return immediately, so we
concat(ret) .. sub(str, head)
-- don't get a final empty string.
if newstart == start then
table.insert(ret, export.sub(str, start, start))
table.remove(captures, 1)
start = start + 1
if start > #str then
return ret
end
end
head = loc + 3
else
loc = find(str, "<!--", head, true)
table.insert(ret, table.remove(captures, 1))
start = newstart
if not loc then
return concat(ret) .. sub(str, head)
end
n = n + 1
ret[n] = sub(str, head, loc - 1)
head = loc
end
end
end
-- Insert any captures from the splitting pattern.
for _, x in ipairs(captures) do
--[==[Removes any HTML comments from the input text. `stage` can be one of three options:
table.insert(ret, x)
* {{lua|"PRE"}} (default) applies the method used by MediaWiki's preprocessor: all {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
* {{lua|"POST"}} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code||<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code||<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will have already been applied by the native parser.
* {{lua|"BOTH"}} applies {{lua|"PRE"}} then {{lua|"POST"}}.]==]
function export.remove_comments(str, stage)
if not stage or stage == "PRE" then
return _remove_comments(str, true)
end
local processed = stage == "POST" and _remove_comments(str) or
stage == "BOTH" and _remove_comments(str, true) or
error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2)
while processed ~= str do
str = processed
processed = _remove_comments(str)
end
end
return str
end
end
end
end


--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
local function uclcfirst(text, dolower)
local function douclcfirst(text)
function export.php_trim(str)
-- The first frontier (%f[^...]) anchors at the first character that is not
-- one of "\0", "\t", "\n", "\v", "\r" or " "; the greedy ".*" then runs to
-- the last frontier where such a character (or the end of the string, which
-- frontier patterns treat as "\0") follows a character outside the set.
-- If nothing matches (empty input, or only trimmable characters), the
-- result is the empty string.
return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""
end
local first_letter = export.sub(text, 1, 1)
php_trim = export.php_trim
first_letter = dolower and export.lower(first_letter) or export.upper(first_letter)

return first_letter .. export.sub(text, 2)
--[==[Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} is normalized to {{code|lua|1}} (a number), and {{code|lua|" foo "}} is normalized to {{code|lua|"foo"}}. If the input is not a string, it is returned unchanged.

After being trimmed with {{code|lua|export.php_trim}}, strings are converted to numbers if:
# They are integers, with no decimals (2.0) or leading zeroes (02).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# For positive values, they do not have a leading {{code|lua|+}} sign.]==]
function export.scribunto_param_key(key)
if type(key) ~= "string" then
return key
end
end
key = php_trim(key)
-- If there's a link at the beginning, re-case the first letter of the
if match(key, "^-?[1-9]%d*$") then
-- link text. This pattern matches both piped and unpiped links.
local num = tonumber(key)
-- If the link is not piped, the second capture (linktext) will be empty.
-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true.
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
return (
if link then
num <= 9007199254740991 and num >= -9007199254740991 or
return "[[" .. link .. "|" .. douclcfirst(linktext ~= "" and linktext or link) .. "]]" .. remainder
key == "9007199254740992" or
key == "-9007199254740992"
) and num or key
elseif key == "0" then
return 0
end
end
return douclcfirst(text)
return key
end
end


do
function export.ucfirst(text)
local byte_escapes
return uclcfirst(text, false)
-- Returns the escape sequence for the single byte `b`: the entry from the
-- `byte_escapes` lookup table when there is one, otherwise a backslash
-- followed by the byte value as three zero-padded decimal digits.
local function escape_byte(b)
	local known = byte_escapes[b]
	if known then
		return known
	end
	return format("\\%03d", byte(b))
end
-- Runs every byte of `str` through escape_byte and returns the result.
function export.escape_bytes(str)
	-- Load the lookup table on first use and keep it for later calls.
	if not byte_escapes then
		byte_escapes = load_data("Module:string utilities/data").byte_escapes
	end
	-- "." matches one byte at a time with the string library; the outer
	-- parentheses discard gsub's replacement count.
	local escaped = gsub(str, ".", escape_byte)
	return escaped
end
end
end


function export.lcfirst(text)
function export.format_fun(str, fun)
return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2)
return uclcfirst(text, true)
if #p1 + #p2 == 1 then
return name == "op" and "{" or
name == "cl" and "}" or
error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
elseif fun(name) and type(fun(name)) ~= "string" then
error(module_name .. ".format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
end
return fun(name) or error(module_name .. ".format: \"" .. name .. "\" not found in table")
end))
end
format_fun = export.format_fun

--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
	-- Adapter that lets format_fun resolve each placeholder name by plain
	-- table lookup in `tbl`.
	local function lookup(key)
		return tbl[key]
	end
	return format_fun(str, lookup)
end
end


-- Faster version of mw.text.nowiki, with minor changes to match the PHP equivalent: ";" always escapes, and colons in unslashed protocols only escape after regex \b.
do
do
local function escape_char(str1, str2)
local function do_uclcfirst(str, case_func)
-- Actual function to re-case of the first letter.
if str2 then
local first_letter = case_func(match(str, "^.[\128-\191]*") or "")
return str1 .. "&#" .. str2:byte() .. ";"
return first_letter .. sub(str, #first_letter + 1)
end
return "&#" .. str1:byte() .. ";"
end
end
local function escape_uri(uri)
local function uclcfirst(str, case_func)
-- If there's a link at the beginning, re-case the first letter of the
local uri_schemes = mw.loadData("Module:string utilities/data").uri_schemes
-- link text. This pattern matches both piped and unpiped links.
return uri_schemes[uri:lower()] and uri .. "&#58;" or uri .. ":"
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
if link then
return "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
end
return do_uclcfirst(str, case_func)
end
end
function export.nowiki(text)
function export.ucfirst(str)
return (text
return uclcfirst(str, uupper)
:gsub("[\"&';<=>%[%]{|}]", escape_char)
:gsub("^[\t\n\r #%*:]", escape_char)
:gsub("([\n\r])([\t\n\r #%*:])", escape_char)
:gsub("%f[^%z\r\n]%-(%-%-%-)", "&#45;%1")
:gsub("__", "_&#95;")
:gsub("://", "&#58;//")
:gsub("(ISBN)(%s)", escape_char)
:gsub("(PMID)(%s)", escape_char)
:gsub("(RFC)(%s)", escape_char)
:gsub("([%w_]+):", escape_uri))
end
end
end


function export.capitalize(text)
function export.lcfirst(str)
return uclcfirst(str, ulower)
if type(text) == "table" then
-- allow calling from a template
text = text.args[1]
end
end
-- Capitalize multi-word that is separated by spaces
local function capitalize(w)
-- by uppercasing the first letter of each part.
return uclcfirst(w, uupper)
-- I assume nobody will input all CAP text.
end
w2 = {}
for w in export.gmatch(text, "%S+") do
--[==[Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.]==]
table.insert(w2, uclcfirst(w, false))
function export.capitalize(str)
	-- A table argument means the function was invoked from a template: the
	-- text to process is then its first argument.
	if type(str) == "table" then
		str = str.args[1]
	end
	-- Every run of non-whitespace characters is passed to the local
	-- `capitalize` helper, which upper-cases its first letter. Text that is
	-- already all caps is not expected as input.
	local capitalized = ugsub(str, "%S+", capitalize)
	return capitalized
end
end
return table.concat(w2, " ")
end
end


do
function export.pluralize(text)
local function word_ends_in_consonant_plus_y(str)
if type(text) == "table" then
-- allow calling from a template
text = text.args[1]
end
-- Pluralize a word in a smart fashion, according to normal English rules.
-- 1. If word ends in consonant + -y, replace the -y with -ies.
-- 2. If the word ends in -s, -x, -z, -sh, -ch, add -es.
-- 3. Otherwise, add -s.
-- This handles links correctly:
-- 1. If a piped link, change the second part appropriately.
-- 2. If a non-piped link and rule #1 above applies, convert to a piped link
-- with the second part containing the plural.
-- 3. If a non-piped link and rules #2 or #3 above apply, add the plural
-- outside the link.
local function word_ends_in_consonant_plus_y(text)
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
-- apply to proper nouns, hence "the Gettys", "the public Ivys".
-- apply to proper nouns, hence "the Gettys", "the public Ivys".
ထႅဝ် 605: ထႅဝ် 964:
-- be important as this function is almost always called on common nouns
-- be important as this function is almost always called on common nouns
-- (e.g. parts of speech, place types).
-- (e.g. parts of speech, place types).
return text:find("[^aeiouAEIOU ]y$")
return find(str, "[^aeiouAEIOU ]y$")
end
end
local function word_takes_es_plural(text)
local function word_takes_es_plural(str)
return text:find("[sxz]$") or text:find("[cs]h$")
return find(str, "[sxz]$") or find(str, "[cs]h$")
end
end
local function do_pluralize(text)
local function do_pluralize(str)
if word_ends_in_consonant_plus_y(text) then
if word_ends_in_consonant_plus_y(str) then
-- avoid returning multiple values
-- avoid returning multiple values
local hack_single_retval = text:gsub("y$", "ies")
return (gsub(str, "y$", "ies"))
elseif word_takes_es_plural(str) then
return hack_single_retval
return str .. "es"
elseif word_takes_es_plural(text) then
return text .. "es"
else
return text .. "s"
end
end
return str .. "s"
end
end
-- Check for a link. This pattern matches both piped and unpiped links.
--[==[
-- If the link is not piped, the second capture (linktext) will be empty.
Pluralize a word in a smart fashion, according to normal English rules.
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
# If word ends in consonant + -y, replace the -y with -ies.
if link then
# If the word ends in -s, -x, -z, -sh, -ch, add -es.
if linktext ~= "" then
# Otherwise, add -s.

This handles links correctly:
# If a piped link, change the second part appropriately.
# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.
]==]
function export.pluralize(str)
if type(str) == "table" then
-- allow calling from a template
str = str.args[1]
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
if not link then
return do_pluralize(str)
elseif linktext ~= "" then
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
elseif word_ends_in_consonant_plus_y(link) then
end
return beginning .. "[[" .. link .. "|" .. gsub(link, "y$", "ies") .. "]]"
if word_ends_in_consonant_plus_y(link) then
return beginning .. "[[" .. link .. "|" .. link:gsub("y$", "ies") .. "]]"
end
end
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
end
end
return do_pluralize(text)
end
end


do
function export.singularize(text)
local function do_singularize(str)
if type(text) == "table" then
local sing = match(str, "^(.-)ies$")
-- allow calling from a template
text = text.args[1]
end
-- Singularize a word in a smart fashion, according to normal English rules.
-- Works analogously to pluralize().
-- NOTE: This doesn't always work as well as pluralize(). Beware. It will
-- mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
-- 1. If word ends in -ies, replace -ies with -y.
-- 2. If the word ends in -xes, -shes, -ches, remove -es. [Does not affect
-- -ses, cf. "houses", "impasses".]
-- 3. Otherwise, remove -s.
-- This handles links correctly:
-- 1. If a piped link, change the second part appropriately. Collapse the
-- link to a simple link if both parts end up the same.
-- 2. If a non-piped link, singularize the link.
-- 3. A link like "[[parish]]es" will be handled correctly because the
-- code that checks for -shes etc. allows ] characters between the
-- 'sh' etc. and final -es.
local function do_singularize(text)
local sing = text:match("^(.-)ies$")
if sing then
if sing then
return sing .. "y"
return sing .. "y"
end
end
-- Handle cases like "[[parish]]es"
-- Handle cases like "[[parish]]es"
local sing = text:match("^(.-[sc]h%]*)es$")
return match(str, "^(.-[sc]h%]*)es$") or
if sing then
return sing
end
-- Handle cases like "[[box]]es"
-- Handle cases like "[[box]]es"
local sing = text:match("^(.-x%]*)es$")
match(str, "^(.-x%]*)es$") or
-- Handle regular plurals
if sing then
match(str, "^(.-)s$") or
return sing
-- Otherwise, return input
end
str
local sing = text:match("^(.-)s$")
if sing then
return sing
end
return text
end
end

local function collapse_link(link, linktext)
local function collapse_link(link, linktext)
if link == linktext then
if link == linktext then
return "[[" .. link .. "]]"
return "[[" .. link .. "]]"
else
return "[[" .. link .. "|" .. linktext .. "]]"
end
end
return "[[" .. link .. "|" .. linktext .. "]]"
end
end
--[==[
Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.


'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
-- Check for a link. This pattern matches both piped and unpiped links.
# If word ends in -ies, replace -ies with -y.
-- If the link is not piped, the second capture (linktext) will be empty.
# If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
# Otherwise, remove -s.
if link then

if linktext ~= "" then
This handles links correctly:
# If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same.
# If a non-piped link, singularize the link.
# A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the
'sh' etc. and final -es.
]==]
function export.singularize(str)
if type(str) == "table" then
-- allow calling from a template
str = str.args[1]
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
if not link then
return do_singularize(str)
elseif linktext ~= "" then
return beginning .. collapse_link(link, do_singularize(linktext))
return beginning .. collapse_link(link, do_singularize(linktext))
end
end
return beginning .. "[[" .. do_singularize(link) .. "]]"
return beginning .. "[[" .. do_singularize(link) .. "]]"
end
end

return do_singularize(text)
end
end


--[==[

Return the appropriate indefinite article to prefix to `str`. Correctly handles links and capitalized text.
function export.add_indefinite_article(text, uppercase)
Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning with
a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.get_indefinite_article(str, ucfirst)
str = str or ""
local is_vowel = false
local is_vowel = false
-- If there's a link at the beginning, examine the first letter of the
-- If there's a link at the beginning, examine the first letter of the
-- link text. This pattern matches both piped and unpiped links.
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
if link then
if link then
is_vowel = export.find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
is_vowel = find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
else
else
is_vowel = export.find(text, "^[AEIOUaeiou]")
is_vowel = find(str, "^[AEIOUaeiou]")
end
end
return (is_vowel and (uppercase and "An " or "an ") or (uppercase and "A " or "a ")) .. text
return is_vowel and (ucfirst and "An" or "an") or (ucfirst and "A" or "a")
end
end
get_indefinite_article = export.get_indefinite_article


--[==[
-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
Prefix `text` with the appropriate indefinite article. Correctly handles links and capitalized
function export.escape_risky_characters(text)
text. Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning
-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software.
with a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
if not mw.ustring.match(text, "%S") then
]==]
return mw.text.encode(text, "%s")
function export.add_indefinite_article(text, ucfirst)
else
return get_indefinite_article(text, ucfirst) .. " " .. text
return mw.text.encode(text, "!#%%&*+/:;<=>?@[\\%]_{|}")
end
end
end



ၶိုၼ်းၶူၼ်ႉၶႆႈၼင်ႇ 16:32, 26 မေႇ 2024

Provides some utility functions for manipulating strings.

Functions

export.explode_utf8

function export.explode_utf8(str)

Explodes a string into an array of UTF-8 characters. Warning: this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.

export.pattern_escape

function export.pattern_escape(str)

Escapes the magic characters used in patterns (Lua's version of regular expressions): $%()*+-.?[]^. For example, "^$()%.[]*+-?" becomes "%^%$%(%)%%%.%[%]%*%+%-%?". This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).

export.charset_escape

function export.charset_escape(str)

Escapes only the magic characters used in pattern character sets: %-]^.

export.replacement_escape

function export.replacement_escape(str)

Escapes only %, which is the only magic character used in replacement patterns with string.gsub and mw.ustring.gsub.

export.len

function export.len(str)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.sub

function export.sub(str, i, j)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.find

function export.find(str, pattern, init, plain)

A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.

export.match

function export.match(str, pattern, init)

A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.

export.gmatch

function export.gmatch(str, pattern)

A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.

export.gsub

function export.gsub(str, pattern, repl, n)

A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.

export.plain_gsub

function export.plain_gsub(str, pattern, repl, n)

Like gsub, but pattern-matching facilities are turned off, so pattern and repl (if a string) are treated as literal.

export.reverse

function export.reverse(str)

Reverses a UTF-8 string; equivalent to string.reverse.

export.char

function export.char(cp, ...)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.codepoint

function export.codepoint(str, i, j)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.gcodepoint

function export.gcodepoint(str, i, j)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.lower

function export.lower(str)

A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.

export.upper

function export.upper(str)

A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.

export.split

function export.split(str, pattern, str_lib, plain)

Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: str_lib forces use of the string library, while plain turns any pattern matching facilities off, treating pattern as literal.

export.gsplit

function export.gsplit(str, pattern, str_lib, plain)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.trim

function export.trim(str, charset)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.decode_entities

function export.decode_entities(str)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.encode_entities

function export.encode_entities(str, charset, str_lib, plain)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.decode_uri

function export.decode_uri(str, enctype)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.remove_comments

function export.remove_comments(str, stage)

Removes any HTML comments from the input text. stage can be one of three options:

  • "PRE" (default) applies the method used by MediaWiki's preprocessor: all <!-- ... --> pairs are removed, as well as any text after an unclosed <!--. This is generally suitable when parsing raw template or parser extension tag code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use Module:template parser instead).
  • "POST" applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any <!-- ... --> pairs until no more are found (e.g. <!-<!-- ... -->- ... --> would be fully removed), but any unclosed <!-- is ignored. This is suitable for handling links embedded in template inputs, where the "PRE" method will have already been applied by the native parser.
  • "BOTH" applies "PRE" then "POST".

export.php_trim

function export.php_trim(str)

Lua equivalent of PHP's trim($string), which trims "\0", "\t", "\n", "\v", "\r" and " ". This is useful when dealing with template parameters, since the native parser trims them like this.

export.scribunto_param_key

function export.scribunto_param_key(key)

Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a frame.args table). For example, "1" is normalized to 1 (a number), and " foo " is normalized to "foo". If the input is not a string, it is returned unchanged.

After being trimmed with export.php_trim, strings are converted to numbers if:

  1. They are integers, with no decimals (2.0) or leading zeroes (02).
  2. They are ≤ 2^53 and ≥ -2^53.
  3. For positive values, they do not have a leading + sign.

export.escape_bytes

function export.escape_bytes(str)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.format_fun

function export.format_fun(str, fun)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.format

function export.format(str, tbl)

This function, unlike string.format and mw.ustring.format, takes just two parameters—a format string and a table—and replaces all instances of {param_name} in the format string with the table's entry for param_name. The opening and closing brace characters can be escaped with \op and \cl, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.

Examples

  • string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})
    produces: "one fish, two fish, red fish, blue fish"
  • string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})
    produces: "The set {1, 2, 3} contains three elements."
    • Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.

export.ucfirst

function export.ucfirst(str)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.lcfirst

function export.lcfirst(str)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.capitalize

function export.capitalize(str)

Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.

export.pluralize

function export.pluralize(str)

Pluralize a word in a smart fashion, according to normal English rules.

  1. If word ends in consonant + -y, replace the -y with -ies.
  2. If the word ends in -s, -x, -z, -sh, -ch, add -es.
  3. Otherwise, add -s.
This handles links correctly:
  1. If a piped link, change the second part appropriately.
  2. If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
  3. If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.

export.singularize

function export.singularize(str)

Singularize a word in a smart fashion, according to normal English rules. Works analogously to pluralize().

NOTE: This doesn't always work as well as pluralize(). Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
  1. If word ends in -ies, replace -ies with -y.
  2. If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
  3. Otherwise, remove -s.
This handles links correctly:
  1. If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same.
  2. If a non-piped link, singularize the link.
  3. A link like "parishes" will be handled correctly because the code that checks for -shes etc. allows ] characters between the 'sh' etc. and final -es.

export.get_indefinite_article

function export.get_indefinite_article(str, ucfirst)

Return the appropriate indefinite article to prefix to str. Correctly handles links and capitalized text. Does not correctly handle words like union, uniform and university that take "a" despite beginning with a 'u'. The returned article will have its first letter capitalized if ucfirst is specified, otherwise lowercase.

export.add_indefinite_article

function export.add_indefinite_article(text, ucfirst)

Prefix text with the appropriate indefinite article. Correctly handles links and capitalized text. Does not correctly handle words like union, uniform and university that take "a" despite beginning with a 'u'. The returned article will have its first letter capitalized if ucfirst is specified, otherwise lowercase.


local mw = mw
local string = string
local table = table
local ustring = mw.ustring

local byte = string.byte
local char = string.char
local concat = table.concat
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local len = string.len
local load_data = mw.loadData
local lower = string.lower
local match = string.match
local next = next
local reverse = string.reverse
local select = select
local sort = table.sort
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ucodepoint = ustring.codepoint
local ufind = ustring.find
local ugcodepoint = ustring.gcodepoint
local ugmatch = ustring.gmatch
local ugsub = ustring.gsub
local ulower = ustring.lower
local umatch = ustring.match
local unpack = unpack
local upper = string.upper
local usub = ustring.sub
local uupper = ustring.upper
-- Defined below.
local charset_escape
local codepoint
local explode_utf8
local format_fun
local get_indefinite_article
local pattern_escape
local pattern_simplifier
local php_trim
local replacement_escape
local u
local ulen

local module_name = "string_utilities"

local export = {}

--[==[Splits a string into an array that holds one UTF-8 character per element. '''Warning''': to keep speed and memory use optimal, the input is not validated, so strings containing non-UTF-8 byte sequences result in undefined behaviour.]==]
function export.explode_utf8(str)
	local chars = {}
	-- Each match is one byte plus any continuation bytes (0x80-0xBF) after it.
	for character in gmatch(str, ".[\128-\191]*") do
		chars[#chars + 1] = character
	end
	return chars
end
explode_utf8 = export.explode_utf8

--[==[Prefixes every magic character used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions) with <code>%</code>; the magic characters are <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. Use this when building a pattern out of arbitrary text (e.g. user input).]==]
function export.pattern_escape(str)
	-- Only the string is returned; gsub's substitution count is dropped.
	local escaped = gsub(str, "[$%%()*+%-.?[%]^]", "%%%0")
	return escaped
end
pattern_escape = export.pattern_escape

--[==[Prefixes with <code>%</code> only those characters which are magic inside a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character set: <code>%-]^</code>.]==]
function export.charset_escape(str)
	-- Only the string is returned; gsub's substitution count is dropped.
	local escaped = gsub(str, "[%%%-%]^]", "%%%0")
	return escaped
end
charset_escape = export.charset_escape

--[==[Doubles every <code>%</code>, which is the only magic character in the replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] taken by string.gsub and mw.ustring.gsub.]==]
function export.replacement_escape(str)
	-- Only the string is returned; gsub's substitution count is dropped.
	local escaped = gsub(str, "%%", "%%%%")
	return escaped
end
replacement_escape = export.replacement_escape

do
	-- Recursively compares two nested tables. Returns true only if every key
	-- of `set1` is present in `set2` with an equal value (or with a value
	-- that itself compares equal through this function), and `set2` has no
	-- additional keys.
	local function check_sets_equal(set1, set2)
		-- `k2` steps through `set2` once per key of `set1`, so that surplus
		-- keys in `set2` can be detected after the loop.
		local k2
		for k1, v1 in next, set1 do
			local v2 = set2[k1]
			-- Differing values are only acceptable if the recursive comparison
			-- succeeds (the values are presumably both nested tables here).
			if v1 ~= v2 and (v2 == nil or not check_sets_equal(v1, v2)) then
				return false
			end
			k2 = next(set2, k2)
		end
		-- `set2` must be exhausted as well.
		return next(set2, k2) == nil
	end
	
	-- Checks that the nested table `bytes` is uniform: the value under its
	-- first key must itself pass this check, and the value under every other
	-- key must compare equal to it via check_sets_equal. A first value of
	-- `true` is treated as a leaf level and always passes.
	local function check_sets(bytes)
		-- `set` is declared here but only assigned inside the loop below.
		local key, set1, set = next(bytes)
		if set1 == true then
			return true
		elseif not check_sets(set1) then
			return false
		end
		-- Compare each remaining entry against the first one.
		while true do
			key, set = next(bytes, key)
			if not key then
				return true
			elseif not check_sets_equal(set, set1) then
				return false
			end
		end
	end
	
	-- Turns an array of byte values into a pattern item. A single byte is
	-- returned as a bare character; otherwise the array is sorted in place and
	-- a bracketed character set is returned, with runs of consecutive byte
	-- values collapsed into "first-last" ranges.
	local function make_charset(range)
		local count = #range
		if count == 1 then
			return char(range[1])
		end
		sort(range)
		local parts, run_start = {}, range[1]
		for idx = 1, count do
			local current, following = range[idx], range[idx + 1]
			-- A run ends where the following value is absent or not adjacent.
			if following ~= current + 1 then
				if current == run_start then
					parts[#parts + 1] = char(current)
				else
					parts[#parts + 1] = char(run_start) .. "-" .. char(current)
				end
				run_start = following
			end
		end
		return "[" .. concat(parts) .. "]"
	end
	
	-- Scans a character set in `pattern`, starting at byte position `pos`,
	-- looking for its closing "]". Returns the position just after the "]"
	-- on success. Returns false if the end of the pattern is reached first,
	-- if a multibyte character (lead byte 194-244) is encountered, or if a
	-- "%" is followed by one of the listed class letters or a non-ASCII byte.
	local function parse_1_byte_charset(pattern, pos)
		while true do
			local ch, nxt_pos
			-- Find the next "%", "]" or multibyte character.
			pos, ch, nxt_pos = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos)
			if not ch then
				return false
			elseif ch == "%" then
				if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", nxt_pos) then
					return false
				end
				-- Skip over the "%" and the escaped character.
				pos = pos + 2
			elseif ch == "]" then
				pos = nxt_pos
				return pos
			else
				return false
			end
		end	
	end
	
	--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false. A number is returned as its string form, and a pattern that needs no changes is returned as-is. The function is wrapped with the memoize helper from [[Module:fun]] (presumably so that results are cached per pattern; its exact semantics are not visible here).]==]
	pattern_simplifier = require("Module:fun").memoize(function(pattern)
		if type(pattern) == "number" then
			return tostring(pattern)
		end
		-- `pos` is the current scan position, `captures` counts "(" seen so
		-- far, `start` is where the not-yet-copied part of `pattern` begins,
		-- and `output` (created lazily) collects the rewritten pieces, with `n`
		-- as its length.
		local pos, captures, start, n, output = 1, 0, 1, 0
		while true do
			local ch, nxt_pos
			-- Find the next item of interest: "%", "(", ".", "[" or a
			-- multibyte character.
			pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos)
			if not ch then
				break
			end
			local nxt = sub(pattern, nxt_pos, nxt_pos)
			if ch == "%" then
				if nxt == "b" then
					-- %b is only kept if both of its delimiters are ASCII.
					if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then
						return false
					end
					pos = pos + 4
				elseif nxt == "f" then
					pos = pos + 2
					-- The frontier must be followed by a non-negated charset.
					if not match(pattern, "^()%[[^^]", pos) then
						return false
					end
					-- Only possible to convert a %f charset which is all
					-- ASCII, so use parse_1_byte_charset.
					pos = parse_1_byte_charset(pattern, pos)
					if not pos then
						return false
					end
				elseif nxt == "Z" then
					pos = pos + 2
					nxt = sub(pattern, pos, pos)
					-- With a repetition quantifier, %Z is left unchanged.
					if nxt == "*" or nxt == "+" or nxt == "-" then
						pos = pos + 1
					else
						-- Otherwise, replace %Z with an explicit pattern for
						-- one non-NUL character plus its continuation bytes.
						output = output or {}
						n = n + 1
						if nxt == "?" then
							output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*"
							pos = pos + 1
						else
							output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*"
						end
						start = pos
					end
				-- All other character classes cannot be converted.
				elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then
					return false
				-- Skip the next character if it's ASCII. Otherwise, we will
				-- still need to do length checks.
				else
					pos = pos + (byte(nxt) < 128 and 2 or 1)
				end
			elseif ch == "(" then
				-- Fail on "()" and once 32 captures have already been seen
				-- (presumably the string library's capture limit).
				if nxt == ")" or captures == 32 then
					return false
				end
				captures = captures + 1
				pos = pos + 1
			elseif ch == "." then
				-- With a repetition quantifier, "." is left unchanged.
				if nxt == "*" or nxt == "+" or nxt == "-" then
					pos = pos + 2
				else
					-- Otherwise, replace "." with an explicit pattern for one
					-- non-continuation byte plus any continuation bytes.
					output = output or {}
					n = n + 1
					if nxt == "?" then
						output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*"
						pos = pos + 2
					else
						output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*"
						pos = pos + 1
					end
					start = pos
				end
			elseif ch == "[" then
				-- Fail negative charsets. TODO: 1-byte charsets should be safe.
				if nxt == "^" then
					return false
				-- If the first character is "%", ch_len is determined by the
				-- next one instead.
				elseif nxt == "%" then
					nxt_pos = nxt_pos + 1
					nxt = sub(pattern, nxt_pos, nxt_pos)
				end
				-- Byte length of the first character in the charset.
				local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos)
				if ch_len == 1 then -- Single-byte charset.
					pos = parse_1_byte_charset(pattern, pos + 1)
					if not pos then
						return false
					end
				else -- Multibyte charset.
					-- `bytes` becomes a nested table keyed by successive byte
					-- values of each character, with `true` at the final byte.
					local charset_pos, bytes = pos
					pos = pos + 1
					while true do -- TODO: non-ASCII charset ranges.
						pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
						if not ch then
							return false
						-- If escaped, get the next character. No need to
						-- distinguish magic characters or character classes,
						-- as they'll all fail for having the wrong length
						-- anyway.
						elseif ch == "%" then
							pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
						elseif ch == "]" then
							pos = nxt_pos
							break
						end
						-- Every character in the set must have the same byte
						-- length as the first.
						if ch_len ~= #ch then
							return false
						end
						bytes = bytes or {}
						-- Shadowing local used to descend into the nested
						-- table without losing the root held by the outer
						-- `bytes`.
						local bytes = bytes
						for i = 1, ch_len - 1 do
							local b = byte(ch, i, i)
							bytes[b] = bytes[b] or {}
							bytes = bytes[b]
						end
						bytes[byte(ch, -1)] = true
						pos = nxt_pos
					end
					if not pos then
						return false
					end
					local nxt = sub(pattern, pos, pos)
					-- Fail on "?", "*" and "-" quantifiers, on "+" with
					-- characters longer than 2 bytes, and when the nested
					-- byte table does not pass check_sets.
					if (
						(nxt == "?" or nxt == "*" or nxt == "-") or
						(nxt == "+" and ch_len > 2) or
						not check_sets(bytes)
					) then
						return false
					end
					-- Collect the byte values found at each level of the
					-- nested table into one array per byte position.
					local ranges, b, key, next_byte = {}, 0
					repeat
						key, next_byte = next(bytes)
						local range, n = {key}, 1
						-- Loop starts on the second iteration.
						for key in next, bytes, key do
							n = n + 1
							range[n] = key
						end
						b = b + 1
						ranges[b] = range
						bytes = next_byte
					until next_byte == true
					if nxt == "+" then
						-- Emit: first-byte set, then any number of bytes from
						-- either set, then second-byte set.
						local range1, range2 = ranges[1], ranges[2]
						ranges[1] = make_charset(range1)
						ranges[3] = make_charset(range2)
						local n = #range2
						for i = 1, #range1 do
							n = n + 1
							range2[n] = range1[i]
						end
						ranges[2] = make_charset(range2) .. "*"
						pos = pos + 1
					else
						for i = 1, #ranges do
							ranges[i] = make_charset(ranges[i])
						end
					end
					output = output or {}
					n = n + 1
					output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)
					start = pos
				end
			-- From here on, `ch` is a literal multibyte character.
			elseif nxt == "+" then
				-- Only 2-byte characters can be converted when followed by "+".
				if #ch ~= 2 then
					return false
				end
				output = output or {}
				n = n + 1
				-- Emit: first byte, any number of either byte, second byte.
				output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2)
				pos = nxt_pos + 1
				start = pos
			elseif nxt == "?" or nxt == "*" or nxt == "-" then
				return false
			else
				pos = nxt_pos
			end
		end
		-- Nothing was rewritten, so the pattern can be used unchanged.
		if start == 1 then
			return pattern
		end
		return concat(output) .. sub(pattern, start)
	end, true)
	export.pattern_simplifier = pattern_simplifier -- For testing.
end

--[==[Returns the length of a string in UTF-8 characters, computed as the byte length minus the number of continuation bytes (0x80-0xBF). If the input is a number, string.len is applied to it instead.]==]
function export.len(str)
	if type(str) == "number" then
		return len(str)
	end
	-- Strip everything except continuation bytes, leaving one byte for each
	-- byte that does not begin a character.
	local continuation_bytes = gsub(str, "[^\128-\191]+", "")
	return #str - #continuation_bytes
end
ulen = export.len

--[==[A version of sub which takes character positions `i` and `j` in a UTF-8 string. If either position is negative, mw.ustring.sub is used; otherwise, the string is scanned with the string library to convert the character positions into byte positions. `i` defaults to 1, and a number given as `str` is converted to a string first.]==]
function export.sub(str, i, j)
	str, i = type(str) == "number" and tostring(str) or str, i or 1
	if i < 0 or j and j < 0 then
		return usub(str, i, j)
	-- The character count can't exceed the byte length, so if `i` is greater
	-- than #str it is out of range.
	elseif j and i > j or i > #str then
		return ""
	end
	-- `n` is the number of characters seen so far; `new_i` is the byte
	-- position of character `i`, once found.
	local n, new_i = 0
	-- Each match is a run of non-continuation bytes (each one beginning a
	-- character) followed by any continuation bytes; `loc1` and `loc2` are
	-- the byte positions of the start of the run and just after it.
	for loc1, loc2 in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do
		n = n + loc2 - loc1
		if not new_i and n >= i then
			-- Count back from the end of the run to the start of character `i`.
			new_i = loc2 - (n - i) - 1
			if not j then
				return sub(str, new_i)
			end
		end
		if j and n > j then
			-- Character `j + 1` starts in this run, so end on the byte before it.
			return sub(str, new_i, loc2 - (n - j) - 1)
		end
	end
	return new_i and sub(str, new_i) or ""
end

do
	-- Post-processes the results of string.find. If there was a match and
	-- `str` contains any non-ASCII bytes, the byte positions `loc1` and `loc2`
	-- are converted into character positions. Any further return values
	-- (captures) are passed through unchanged.
	local function _find(str, loc1, loc2, ...)
		if loc1 and not match(str, "^()[^\128-\255]*$") then
			-- Use raw values of loc1 and loc2 to get loc1 and the length of the match.
			loc1, loc2 = ulen(sub(str, 1, loc1)), ulen(sub(str, loc1, loc2))
			-- Offset length with loc1 to get loc2.
			loc2 = loc1 + loc2 - 1
		end
		return loc1, loc2, ...
	end
	
	--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find. The string library is only attempted when `init` is 1 (or omitted) or the input string is pure ASCII, and then only if `plain` is set or the pattern can be simplified.]==]
	function export.find(str, pattern, init, plain)
		init = init or 1
		-- A start position other than 1 is a character offset, which only
		-- equals the byte offset if the string is pure ASCII.
		local byte_safe = init == 1 or match(str, "^()[^\128-\255]*$")
		if not byte_safe then
			return ufind(str, pattern, init, plain)
		end
		if plain then
			return _find(str, find(str, pattern, init, true))
		end
		local converted = pattern_simplifier(pattern)
		if not converted then
			return ufind(str, pattern, init)
		end
		return _find(str, find(str, converted, init))
	end
end

--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match. The string library is only attempted when `init` is 1 (or omitted) or the input string is pure ASCII, and then only if the pattern can be simplified.]==]
function export.match(str, pattern, init)
	init = init or 1
	-- A start position other than 1 is a character offset, which only equals
	-- the byte offset if the string is pure ASCII.
	local byte_safe = init == 1 or match(str, "^()[^\128-\255]*$")
	if byte_safe then
		local converted = pattern_simplifier(pattern)
		if converted then
			return match(str, converted, init)
		end
	end
	return umatch(str, pattern, init)
end

--[==[A version of gmatch which uses string.gmatch if the pattern can be simplified into a string library pattern, but otherwise uses mw.ustring.gmatch.]==]
function export.gmatch(str, pattern)
	local converted = pattern_simplifier(pattern)
	if not converted then
		return ugmatch(str, pattern)
	end
	return gmatch(str, converted)
end

--[==[A version of gsub which uses string.gsub if the pattern can be simplified into a string library pattern, but otherwise uses mw.ustring.gsub. As with those functions, the substitution count is returned as a second value.]==]
function export.gsub(str, pattern, repl, n)
	local converted = pattern_simplifier(pattern)
	if not converted then
		return ugsub(str, pattern, repl, n)
	end
	return gsub(str, converted, repl, n)
end

--[==[Like gsub, but with pattern-matching facilities turned off: `pattern` is matched literally, and `repl` is inserted literally if it is a string. A non-string `repl` is passed to string.gsub unchanged.]==]
function export.plain_gsub(str, pattern, repl, n)
	local literal_pattern = pattern_escape(pattern)
	if type(repl) == "string" then
		repl = replacement_escape(repl)
	end
	return gsub(str, literal_pattern, repl, n)
end

--[==[Reverses a UTF-8 string character by character; the UTF-8-aware equivalent of string.reverse.]==]
function export.reverse(str)
	-- First reverse the bytes within each multibyte character, so that
	-- reversing the whole string afterwards puts them back in order.
	local prepared = gsub(str, "[\194-\244][\128-\191]*", reverse)
	return reverse(prepared)
end

do
	-- Throws an out-of-range error for a codepoint. `cp` is the
	-- already-formatted codepoint text to show in the message. The error is
	-- raised at level 2, i.e. attributed to the function which called err.
	local function err(cp)
		error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2)
	end

	-- Converts a single codepoint (a number, or a value which tonumber can
	-- convert) into its UTF-8 encoded string. Throws an error if the value is
	-- not numeric, is negative, or is greater than 0x10FFFF. Surrogates
	-- (0xD800-0xDFFF) give "?".
	-- NOTE(review): the divisions below produce fractional values, so this
	-- relies on string.char truncating them (as it does in the Lua 5.1 used
	-- by Scribunto) - confirm before running under a different Lua version.
	local function utf8_char(cp)
		local value = tonumber(cp)
		if not value then
			-- Give a clear message instead of failing on a nil comparison.
			error("Codepoint " .. tostring(cp) .. " is not a number.", 2)
		end
		cp = value
		if cp < 0 then
			-- "%X" can't represent a negative value, so format the magnitude
			-- and add the sign by hand.
			err("-0x" .. format("%X", -cp))
		elseif cp < 0x80 then
			return char(cp)
		elseif cp < 0x800 then
			return char(
				0xC0 + cp / 0x40,
				0x80 + cp % 0x40
			)
		elseif cp < 0x10000 then
			if cp >= 0xD800 and cp < 0xE000 then
				return "?" -- mw.ustring.char returns "?" for surrogates.
			end
			return char(
				0xE0 + cp / 0x1000,
				0x80 + cp / 0x40 % 0x40,
				0x80 + cp % 0x40
			)
		elseif cp < 0x110000 then
			return char(
				0xF0 + cp / 0x40000,
				0x80 + cp / 0x1000 % 0x40,
				0x80 + cp / 0x40 % 0x40,
				0x80 + cp % 0x40
			)
		end
		err("0x" .. format("%X", cp))
	end

	--[==[Converts one or more codepoints into a UTF-8 encoded string, with each argument being converted by the local utf8_char helper and the results concatenated. If the second argument is nil, only the first argument is converted.]==]
	function export.char(cp, ...)
		local first_extra = ...
		if first_extra == nil then
			return utf8_char(cp)
		end
		local pieces = {cp, ...}
		-- Count with select, as the table's length is unreliable if any of
		-- the arguments are nil.
		local total = select("#", cp, ...)
		for idx = 1, total do
			pieces[idx] = utf8_char(pieces[idx])
		end
		return concat(pieces)
	end
	u = export.char
end

do
	-- Decodes one UTF-8 character from its byte values `b1` to `b4` (only as
	-- many as the lead byte `b1` calls for are used). Returns the codepoint
	-- and the number of bytes consumed. The input is not validated.
	local function get_codepoint(b1, b2, b3, b4)
		if b1 < 128 then
			return b1, 1
		end
		-- In each case below, the marker bits of the lead byte (0xC0, 0xE0 or
		-- 0xF0) and of each continuation byte (0x80) are subtracted off before
		-- the payloads are combined, six bits per continuation byte.
		if b1 < 224 then
			return (b1 - 0xC0) * 0x40 + (b2 - 0x80), 2
		end
		if b1 < 240 then
			return (b1 - 0xE0) * 0x1000 + (b2 - 0x80) * 0x40 + (b3 - 0x80), 3
		end
		return (b1 - 0xF0) * 0x40000 + (b2 - 0x80) * 0x1000 + (b3 - 0x80) * 0x40 + (b4 - 0x80), 4
	end

	--[==[Returns the codepoints of the characters at positions `i` to `j` (inclusive) of a UTF-8 string, as multiple return values. `i` defaults to 1 and `j` defaults to `i`; a `j` of -1 means the end of the string. If either position is negative (other than a `j` of -1), mw.ustring.codepoint is used. Positions past the end of the string give no values. If `str` is a number, string.byte is applied to it instead.]==]
	function export.codepoint(str, i, j)
		if type(str) == "number" then
			return byte(str, i, j)
		end
		i = i or 1
		if j == -1 then
			-- The byte length is an upper bound on the character count; the
			-- loop below stops once the string runs out.
			j = #str
		elseif j == nil then
			j = i
		end
		-- Fast path for the first character.
		if i == 1 and j == 1 then
			local b1, b2, b3, b4 = byte(str, 1, 4)
			if not b1 then
				return -- Empty string: no values.
			end
			return (get_codepoint(b1, b2, b3, b4))
		elseif i < 0 or j < 0 then
			return ucodepoint(str, i, j) -- FIXME
		end
		-- `n` is the current character position, `nb` the current byte
		-- position, and `ret` the codepoints collected so far (`nr` of them).
		local n, nb, ret, nr = 0, 1, {}, 0
		while n < j do
			n = n + 1
			if n < i then
				-- Skip over a character before position `i`, using its lead
				-- byte to determine its length.
				local b = byte(str, nb)
				if not b then
					break -- `i` is past the end of the string.
				end
				nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
			else
				local b1, b2, b3, b4 = byte(str, nb, nb + 3)
				if not b1 then
					break
				end
				nr = nr + 1
				local add
				ret[nr], add = get_codepoint(b1, b2, b3, b4)
				nb = nb + add
			end
		end
		return unpack(ret)
	end
	codepoint = export.codepoint
	
	--[==[Returns an iterator function which gives the codepoint of each character of a UTF-8 string in turn, from position `i` (default 1) to position `j` (default, or if -1, the end of the string). If either position is negative (other than a `j` of -1), mw.ustring.gcodepoint is used.]==]
	function export.gcodepoint(str, i, j)
		-- A `j` of -1 is treated the same as no `j` at all.
		i, j = i or 1, j ~= -1 and j or nil
		if i < 0 or j and j < 0 then
			return ugcodepoint(str, i, j) -- FIXME
		end
		-- `n` is the current character position and `nb` the current byte
		-- position. Skip over the characters before position `i`, using each
		-- lead byte to determine the character's length.
		local n, nb = 1, 1
		while n < i do
			local b = byte(str, nb)
			if not b then
				break
			end
			nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
			n = n + 1
		end
		
		-- Each call decodes the character at `nb` and advances past it,
		-- returning nil after position `j` or at the end of the string.
		return function()
			if j and n > j then
				return nil
			end
			n = n + 1
			local b1, b2, b3, b4 = byte(str, nb, nb + 3)
			if not b1 then
				return nil
			end
			local ret, add = get_codepoint(b1, b2, b3, b4)
			nb = nb + add
			return ret
		end
	end
end

--[==[A version of lower which uses string.lower if the input is pure ASCII, but otherwise uses mw.ustring.lower.]==]
function export.lower(str)
	if match(str, "^()[^\128-\255]*$") then
		return lower(str)
	end
	return ulower(str)
end

--[==[A version of upper which uses string.upper if the input is pure ASCII, but otherwise uses mw.ustring.upper.]==]
function export.upper(str)
	if match(str, "^()[^\128-\255]*$") then
		return upper(str)
	end
	return uupper(str)
end

do
	-- Appends the values in `...` (captures from the splitting pattern) to
	-- the array `text`, which currently holds `n` items, stopping at the first
	-- value which is nil or false. Returns the new length of `text`.
	local function add_captures(text, n, ...)
		-- Insert any captures from the splitting pattern.
		-- `offset` maps an index in `text` back to an index in `...`.
		local offset, capture = n - 1, ...
		while capture do
			n = n + 1
			text[n] = capture
			capture = select(n - offset, ...)
		end
		return n
	end
	
	-- Performs one step of a split. `loc1`, `loc2` and `...` are the results
	-- of the find call for the splitting pattern, searching from `start`. The
	-- next chunk (plus any captures) is appended to `text`, which currently
	-- holds `n` items. Returns the new length of `text` and the position to
	-- resume searching from; the second value is omitted (and on the final
	-- chunk, so is the first) once splitting is complete.
	local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...)
		if not (loc1 and start <= str_len) then
			-- If no match, or there is but we're past the end of the string
			-- (which happens when the match is the empty string), then add
			-- the final chunk and return.
			n = n + 1
			text[n] = _sub(str, start)
			return
		elseif loc2 < loc1 then
			-- Special case: If we match the empty string, then include the
			-- next character; this avoids an infinite loop, and makes
			-- splitting by an empty string work the way mw.text.split() does
			-- (including non-adjacent empty string matches with %f). If we
			-- reach the end of the string this way, return immediately, so we
			-- don't get a final empty string. If using the string library, we
			-- need to make sure we advance by one UTF-8 character.
			if _sub == sub then
				-- Extend past any continuation bytes of the character at loc1.
				loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
			end
			n = n + 1
			text[n] = _sub(str, start, loc1)
			start = loc1 + 1
			if start > str_len then
				return ... and add_captures(text, n, ...) or n
			end
		else
			-- Add chunk up to the current match.
			n = n + 1
			text[n] = _sub(str, start, loc1 - 1)
			start = loc2 + 1
		end
		return (... and add_captures(text, n, ...) or n), start
	end
	
	-- Splits `str` by `pattern`, using the given sub and find functions (which
	-- must belong to the same library, with `str_len` measured in that
	-- library's units). Returns an array of the chunks, with any captures
	-- from the pattern in between them.
	local function _split(str, pattern, str_len, _sub, _find, plain)
		local pieces, count, from = {}, 0, 1
		-- iterate stops returning a resume position once splitting is done.
		while from do
			count, from = iterate(str, str_len, pieces, count, from, _sub, _find(str, pattern, from, plain))
		end
		return pieces
	end
	
	--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). The string library is used if the pattern can be simplified into a string library pattern; otherwise, the ustring library is used. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal (which also uses the string library).]==]
	function export.split(str, pattern, str_lib, plain)
		if not (str_lib or plain) then
			local converted = pattern_simplifier(pattern)
			if not converted then
				return _split(str, pattern, ulen(str), usub, ufind)
			end
			pattern = converted
		end
		-- `plain` is always nil or false here if the pattern was converted.
		return _split(str, pattern, #str, sub, find, plain)
	end
	export.capturing_split = export.split -- To be removed.
end

do
	-- TODO: merge this with export.split. Not clear how to do this while
	-- maintaining the same level of performance, as gsplit is slower.
	-- Returns an iterator function which splits `str` by `pattern` lazily,
	-- using the given sub and find functions (which must belong to the same
	-- library, with `str_len` measured in that library's units). Each call
	-- returns the next chunk, followed by any captures from the match of the
	-- splitting pattern which ended it.
	local function _split(str, pattern, str_len, _sub, _find, plain)
		-- `start` is the position to resume from; `final` is set once the
		-- last chunk has been returned.
		local start, final = 1
		
		-- Takes the results of the find call and returns the next chunk.
		local function iter(loc1, loc2, ...)
			-- If no match, return the final chunk.
			if not loc1 then
				final = true
				return _sub(str, start)
			end
			-- Special case: If we match the empty string, then eat the
			-- next character; this avoids an infinite loop, and makes
			-- splitting by the empty string work the way mw.text.gsplit() does
			-- (including non-adjacent empty string matches with %f). If we
			-- reach the end of the string this way, set `final` to true, so we
			-- don't get stuck matching the empty string at the end.
			local chunk
			if loc2 < loc1 then
				-- If using the string library, we need to make sure we advance
				-- by one UTF-8 character.
				if _sub == sub then
					loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
				end
				chunk = _sub(str, start, loc1)
				if loc1 >= str_len then
					final = true
				else
					start = loc1 + 1
				end
			-- Eat chunk up to the current match.
			else
				chunk = _sub(str, start, loc1 - 1)
				start = loc2 + 1
			end
			return chunk, ...
		end
		
		return function()
			if not final then
				return iter(_find(str, pattern, start, plain))
			end
			return nil
		end
	end
	
	--[==[Iterator version of split: returns an iterator function which gives each chunk of `str` in turn, followed by any captures from the match of the splitting pattern which ended it. The string library is used if the pattern can be simplified into a string library pattern; otherwise, the ustring library is used. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal (which also uses the string library).]==]
	function export.gsplit(str, pattern, str_lib, plain)
		if not (str_lib or plain) then
			local converted = pattern_simplifier(pattern)
			if not converted then
				return _split(str, pattern, ulen(str), usub, ufind)
			end
			pattern = converted
		end
		-- `plain` is always nil or false here if the pattern was converted.
		return _split(str, pattern, #str, sub, find, plain)
	end
end

--[==[Trims characters from both ends of a string. By default, these are the whitespace characters matched by <code>%s</code>. If `charset` is given, it is inserted as-is between square brackets in a pattern, so it should use character set syntax, with any magic characters already escaped. The string library is used if `charset` is omitted or is pure ASCII; otherwise, the ustring library is used.]==]
function export.trim(str, charset)
	if not charset then
		-- A string made up entirely of whitespace trims to nothing.
		if match(str, "^()%s*$") then
			return ""
		end
		return match(str, "^%s*(.*%S)")
	end
	if match(charset, "^()[^\128-\255]*$") then
		-- Likewise for a string made up entirely of charset characters.
		if match(str, "^()[" .. charset .. "]*$") then
			return ""
		end
		return match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])")
	end
	return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$")
end

do
	local entities

	-- Decodes the digits of a numeric character reference. `code` must match
	-- `pattern` in full and, read as a number in `base`, be below 0x110000;
	-- the corresponding character is then returned. Otherwise, returns nil.
	local function decode_numeric_entity(code, pattern, base)
		if not match(code, pattern) then
			return nil
		end
		local cp = tonumber(code, base)
		if cp and cp < 0x110000 then
			return u(cp)
		end
		return nil
	end

	-- Replacement callback for export.decode_entities. `hash` is "#" for a
	-- numeric character reference, `x` is the optional "x" or "X" which marks
	-- it as hexadecimal, and `code` is the rest of the entity's text. Returns
	-- the decoded character, or nil if the entity is not recognised (which
	-- causes gsub to leave the original text in place).
	local function decode_entity(hash, x, code)
		if hash == "#" then
			-- A reference with no "x" is decimal only; it must not fall back
			-- to being read as hexadecimal if decimal decoding fails (e.g.
			-- "&#ff;" is not a valid reference).
			if x == "" then
				return decode_numeric_entity(code, "^%d+$")
			end
			return decode_numeric_entity(code, "^%x+$", 16)
		end
		-- Named entity: any leading "x" captured separately is part of the
		-- name. The data module is loaded on first use.
		entities = entities or load_data("Module:data/entities")
		return entities[x .. code]
	end

	--[==[Replaces any HTML entities in a string (named, decimal and hexadecimal) with the characters they represent. Anything that is not recognised as a valid entity is left unchanged.]==]
	-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
	function export.decode_entities(str)
		-- Cheap plain-text check, so that strings with no "&" skip gsub.
		if not find(str, "&", 1, true) then
			return str
		end
		local decoded = gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity)
		return decoded
	end
end

do
	local html_entities
	
	-- Replacement callback for export.encode_entities. Returns the HTML
	-- entity for the character `ch`: either the one already stored in
	-- `html_entities`, or a decimal numeric character reference, which is
	-- then stored there for reuse.
	local function encode_entity(ch)
		local cached = html_entities[ch]
		if not cached then
			cached = "&#" .. codepoint(ch) .. ";"
			html_entities[ch] = cached
		end
		return cached
	end
	
	--[==[Replaces characters in a string with HTML entities. By default, this applies to <code>"&'<></code> and the non-breaking space (U+00A0), which are replaced with named entities (or <code>&amp;#039;</code> for the apostrophe). If `charset` is given, the characters in it are replaced instead, using decimal numeric character references for any without a stored entity. `charset` is inserted as-is between square brackets in a pattern, so it should use character set syntax, unless `plain` is set, in which case all of its characters are treated as literal. `str_lib` forces use of the string library, which throws an error if `charset` contains non-ASCII characters (and `plain` is not set).]==]
	function export.encode_entities(str, charset, str_lib, plain)
		-- Memoized HTML entities (taken from mw.text.lua).
		html_entities = html_entities or {
			["\""] = "&quot;",
			["&"] = "&amp;",
			["'"] = "&#039;",
			["<"] = "&lt;",
			[">"] = "&gt;",
			["\194\160"] = "&nbsp;",
		}
		if not charset then
			-- "\194" is only replaced when followed by "\160" (together, a
			-- non-breaking space), as there is no table entry for it alone.
			return (gsub(str, "[\"&'<>\194]\160?", html_entities))
		elseif plain then
			local pattern = "[" .. charset_escape(charset) .. "]"
			-- Matching byte-by-byte is only correct for an ASCII charset:
			-- otherwise, the individual bytes of multibyte characters would
			-- be matched, so use the ustring library (unless the string
			-- library has been explicitly forced).
			if str_lib or match(charset, "^()[^\128-\255]*$") then
				return (gsub(str, pattern, encode_entity))
			end
			return (ugsub(str, pattern, encode_entity))
		elseif str_lib then
			if not match(charset, "^()[^\128-\255]*$") then
				error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.")
			end
			return (gsub(str, "[" .. charset .. "]", encode_entity))
		end
		local pattern = "[" .. charset .. "]"
		-- Use the string library if the pattern can be converted.
		local simple = pattern_simplifier(pattern)
		if simple then
			return (gsub(str, simple, encode_entity))
		end
		return (ugsub(str, pattern, encode_entity))
	end
end

do
	-- Converts a pair of hexadecimal digits into the byte with that value.
	local function decode_path(code)
		local byte_value = tonumber(code, 16)
		return char(byte_value)
	end
	
	-- Replacement callback for the QUERY and WIKI encoding types. `lead` is
	-- the matched special character and `trail` the (up to two) hexadecimal
	-- digits which follow it. "+" and "_" become a space, with the digits
	-- kept as they are; "%" with two digits is decoded into a byte; anything
	-- else is returned unchanged.
	local function decode(lead, trail)
		if lead == "+" or lead == "_" then
			return " " .. trail
		end
		if #trail ~= 2 then
			return lead .. trail
		end
		return decode_path(trail)
	end
	
	--[==[Decodes a percent-encoded string. `enctype` is case-insensitive, and can be one of three options:
	* {{lua|"QUERY"}} (default) decodes percent-encoded bytes, and converts <code>+</code> into a space.
	* {{lua|"PATH"}} decodes percent-encoded bytes only.
	* {{lua|"WIKI"}} decodes percent-encoded bytes, and converts <code>_</code> into a space.
	Any other value throws an error.]==]
	function export.decode_uri(str, enctype)
		enctype = enctype and upper(enctype) or "QUERY"
		-- In each case, a cheap plain-text check for the special characters
		-- allows strings with nothing to decode to skip gsub.
		if enctype == "PATH" then
			if not find(str, "%", 1, true) then
				return str
			end
			return (gsub(str, "%%(%x%x)", decode_path))
		elseif enctype == "QUERY" then
			if not (find(str, "%", 1, true) or find(str, "+", 1, true)) then
				return str
			end
			return (gsub(str, "([%%%+])(%x?%x?)", decode))
		elseif enctype == "WIKI" then
			if not (find(str, "%", 1, true) or find(str, "_", 1, true)) then
				return str
			end
			return (gsub(str, "([%%_])(%x?%x?)", decode))
		end
		error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
	end
end

do
	-- Makes a single left-to-right pass over `str`, removing each
	-- "<!-- ... -->" pair. If `pre` is truthy, an unclosed "<!--" and all
	-- text after it is removed as well; otherwise, it is left in place.
	local function _remove_comments(str, pre)
		-- `head` is the position of the comment opener currently being
		-- handled.
		local head = find(str, "<!--", 1, true)
		if not head then
			return str
		end
		-- `ret` collects the text outside of any comments, with `n` as its
		-- length.
		local ret, n = {sub(str, 1, head - 1)}, 1
		while true do
			-- Look for the closer, starting after the 4-byte opener.
			local loc = find(str, "-->", head + 4, true)
			if not loc then
				-- Unclosed comment.
				return pre and concat(ret) or
					concat(ret) .. sub(str, head)
			end
			-- Move to just after the 3-byte closer, then look for the next
			-- opener.
			head = loc + 3
			loc = find(str, "<!--", head, true)
			if not loc then
				return concat(ret) .. sub(str, head)
			end
			n = n + 1
			ret[n] = sub(str, head, loc - 1)
			head = loc
		end
	end
	
	--[==[Removes any HTML comments from the input text. `stage` can be one of three options:
	* {{lua|"PRE"}} (default) applies the method used by MediaWiki's preprocessor: all {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
	* {{lua|"POST"}} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code||<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code||<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will have already been applied by the native parser.
	* {{lua|"BOTH"}} applies {{lua|"PRE"}} then {{lua|"POST"}}.
	Any other value throws an error.]==]
	function export.remove_comments(str, stage)
		if not stage or stage == "PRE" then
			return _remove_comments(str, true)
		end
		-- The first pass depends on the stage.
		local processed
		if stage == "POST" then
			processed = _remove_comments(str)
		elseif stage == "BOTH" then
			processed = _remove_comments(str, true)
		else
			error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2)
		end
		-- Repeat the "POST" method until a pass makes no further changes.
		while processed ~= str do
			str = processed
			processed = _remove_comments(str)
		end
		return str
	end
end

--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which removes {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}} from both ends of the string. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
function export.php_trim(str)
	-- The frontier patterns anchor the match on the first and last
	-- characters which are not in the set being trimmed.
	local trimmed = match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]")
	if not trimmed then
		return ""
	end
	return trimmed
end
php_trim = export.php_trim

--[==[Converts a parameter name into its Scribunto-normalized form, i.e. the key under which that parameter would be found in a {{code|lua|frame.args}} table. For example, {{code|lua|"1"}} becomes {{code|lua|1}} (a number), and {{code|lua|" foo "}} becomes {{code|lua|"foo"}}. Inputs which are not strings are returned unchanged.

Strings are first trimmed with {{code|lua|export.php_trim}}, and are then converted to numbers if:
# They are integers, with no decimals (2.0) or leading zeroes (02).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# For positive values, they do not have a leading {{code|lua|+}} sign.]==]
function export.scribunto_param_key(key)
	if type(key) == "string" then
		key = php_trim(key)
		if key == "0" then
			return 0
		elseif match(key, "^-?[1-9]%d*$") then
			local max_exact = 9007199254740991 -- 2^53 - 1
			local num = tonumber(key)
			if num <= max_exact and num >= -max_exact then
				return num
			-- Lua numbers are only exact up to 2^53 - 1, and 2^53 == 2^53 + 1
			-- evaluates to true, so the two boundary values have to be
			-- recognized by comparing the string instead of the number.
			elseif key == "9007199254740992" or key == "-9007199254740992" then
				return num
			end
		end
	end
	return key
end

do
	-- Escape table; fetched from the data module the first time it is needed.
	local escapes
	
	-- Returns the escaped form of a single byte: its entry in the escape
	-- table if it has one, otherwise a backslash followed by the byte's
	-- value as three decimal digits.
	local function escape_one(ch)
		local known = escapes[ch]
		if known then
			return known
		end
		return format("\\%03d", byte(ch))
	end
	
	--[==[Escapes every byte in `str`. Bytes with an entry in the `byte_escapes` table of [[Module:string utilities/data]] are replaced by that entry; every other byte is replaced by a backslash followed by its value as a three-digit decimal number.]==]
	function export.escape_bytes(str)
		if not escapes then
			escapes = load_data("Module:string utilities/data").byte_escapes
		end
		-- Assign to a local first so that gsub's replacement count is dropped.
		local escaped = gsub(str, ".", escape_one)
		return escaped
	end
end

--[==[Replaces every instance of {{code|lua|{param_name}}} in the format string `str` with the value returned by calling `fun` on {{code|lua|"param_name"}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively, and a parameter name beginning with a backslash can be escaped by doubling the initial backslash. `fun` is called exactly once for each replacement field. Throws an error if `fun` returns {{code|lua|nil}} or {{code|lua|false}}, if it returns a value which is not a string, or if an unrecognized escape sequence is used.]==]
function export.format_fun(str, fun)
	return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2)
		-- Exactly one leading backslash marks an escape sequence. With two,
		-- `name` is a key which itself begins with a (single) backslash.
		if #p1 + #p2 == 1 then
			return name == "op" and "{" or
				name == "cl" and "}" or
				error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
		end
		-- Look the key up once only: `fun` may be expensive or stateful, so
		-- the same result must be used for validation and for substitution.
		local value = fun(name)
		if not value then
			error(module_name .. ".format: \"" .. name .. "\" not found in table")
		elseif type(value) ~= "string" then
			error(module_name .. ".format: \"" .. name .. "\" is a " .. type(value) .. ", not a string")
		end
		return value
	end))
end
format_fun = export.format_fun

--[==[Unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, this function takes exactly two parameters, a format string and a table, and substitutes the table's entry for {{code|lua|param_name}} wherever {{code|lua|{param_name}}} occurs in the format string. Literal opening and closing braces are written as <code>{\op}</code> and <code>{\cl}</code>, respectively. To refer to a table key that begins with a backslash, double that initial backslash.
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
	-- Adapt the table to the lookup-function interface of format_fun.
	local function lookup(key)
		return tbl[key]
	end
	return format_fun(str, lookup)
end

do
	-- Re-cases the first character of `str` with `case_func`, leaving the
	-- rest of the string untouched.
	local function do_uclcfirst(str, case_func)
		-- A UTF-8 character is one leading byte plus any continuation bytes.
		local first_letter = match(str, "^.[\128-\191]*") or ""
		-- Slice the remainder using the byte length of the *original*
		-- character: re-casing can change the encoded length (e.g. "ı" takes
		-- two bytes but its uppercase form "I" takes one), so the length of
		-- the converted character must not be used here.
		return case_func(first_letter) .. sub(str, #first_letter + 1)
	end
	
	local function uclcfirst(str, case_func)
		-- If there's a link at the beginning, re-case the first letter of the
		-- link text. This pattern matches both piped and unpiped links.
		-- If the link is not piped, the second capture (linktext) will be empty.
		local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
		if link then
			-- The output is always a piped link, so that the link target
			-- keeps its original case.
			return "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
		end
		return do_uclcfirst(str, case_func)
	end
	
	--[==[Uppercases the first letter of a string. If the string begins with a link, the first letter of the link text is uppercased instead.]==]
	function export.ucfirst(str)
		return uclcfirst(str, uupper)
	end

	--[==[Lowercases the first letter of a string. If the string begins with a link, the first letter of the link text is lowercased instead.]==]
	function export.lcfirst(str)
		return uclcfirst(str, ulower)
	end
	
	-- Replacement callback for export.capitalize: uppercases the first
	-- letter of a single word.
	local function capitalize(w)
		return uclcfirst(w, uupper)
	end
	
	--[==[Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.]==]
	function export.capitalize(str)
		if type(str) == "table" then
			-- allow calling from a template
			str = str.args[1]
		end
		-- Capitalize multi-word that is separated by spaces
		-- by uppercasing the first letter of each part.
		-- I assume nobody will input all CAP text.
		return (ugsub(str, "%S+", capitalize))
	end
end

do
	-- True if the word ends in a consonant followed by "y".
	-- FIXME: a subrule of rule #1 below says that the -ies ending doesn't
	-- apply to proper nouns (hence "the Gettys", "the public Ivys"). That is
	-- not handled here, but it may not matter much, since this is almost
	-- always called on common nouns (e.g. parts of speech, place types).
	local function ends_in_consonant_y(word)
		return find(word, "[^aeiouAEIOU ]y$") ~= nil
	end
	
	-- True if the word forms its plural with -es (rule #2 below).
	local function takes_es(word)
		if find(word, "[sxz]$") then
			return true
		end
		return find(word, "[cs]h$") ~= nil
	end
	
	-- Swaps a final "y" for "ies".
	local function y_to_ies(word)
		-- Assign to a local first so that gsub's replacement count is dropped.
		local plural = gsub(word, "y$", "ies")
		return plural
	end
	
	-- Pluralizes plain (unlinked) text.
	local function plural_form(word)
		if ends_in_consonant_y(word) then
			return y_to_ies(word)
		end
		return word .. (takes_es(word) and "es" or "s")
	end
	
	--[==[
	Pluralize a word in a smart fashion, according to normal English rules.
	# If the word ends in consonant + -y, the -y is replaced with -ies.
	# If the word ends in -s, -x, -z, -sh or -ch, -es is added.
	# Otherwise, -s is added.

	Links are handled correctly:
	# In a piped link, the second part is changed appropriately.
	# A non-piped link to which rule #1 above applies is converted to a piped link, with the second part containing the plural.
	# A non-piped link to which rule #2 or #3 above applies has the plural ending added outside the link.
	]==]
	function export.pluralize(str)
		if type(str) == "table" then
			-- A frame object: this allows the function to be called from a template.
			str = str.args[1]
		end
		-- Look for a link at the end of the text. Both piped and unpiped
		-- links match; for an unpiped link, `display` is the empty string.
		local prefix, target, display = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
		if not target then
			return plural_form(str)
		end
		if display ~= "" then
			return prefix .. "[[" .. target .. "|" .. plural_form(display) .. "]]"
		end
		if ends_in_consonant_y(target) then
			return prefix .. "[[" .. target .. "|" .. y_to_ies(target) .. "]]"
		end
		return prefix .. "[[" .. target .. "]]" .. (takes_es(target) and "es" or "s")
	end
end

do
	-- Patterns tried in order once the -ies rule has failed; each one
	-- captures the singular form.
	local ending_patterns = {
		"^(.-[sc]h%]*)es$", -- cases like "[[parish]]es"
		"^(.-x%]*)es$", -- cases like "[[box]]es"
		"^(.-)s$", -- regular plurals
	}
	
	-- Singularizes plain text (which may end in a link followed by -es/-s).
	local function singular_form(word)
		local stem = match(word, "^(.-)ies$")
		if stem then
			return stem .. "y"
		end
		for i = 1, #ending_patterns do
			local singular = match(word, ending_patterns[i])
			if singular then
				return singular
			end
		end
		-- Nothing matched, so the input is returned as it is.
		return word
	end
	
	-- Builds a link, using the unpiped form if the target and the display
	-- text are identical.
	local function make_link(target, display)
		if target ~= display then
			return "[[" .. target .. "|" .. display .. "]]"
		end
		return "[[" .. target .. "]]"
	end
	
	--[==[
	Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.

	'''NOTE''': This is not as reliable as {pluralize()}. Beware. Cases like "passes" -> "passe" and "eyries" -> "eyry" are mishandled.
	# If the word ends in -ies, -ies is replaced with -y.
	# If the word ends in -xes, -shes or -ches, -es is removed. [Does not affect -ses, cf. "houses", "impasses".]
	# Otherwise, -s is removed.

	Links are handled correctly:
	# In a piped link, the second part is changed appropriately. The link is collapsed to a simple link if both parts end up the same.
	# In a non-piped link, the link itself is singularized.
	# A link like "[[parish]]es" is handled correctly, because the check for -shes etc. allows ] characters between the
	  'sh' etc. and the final -es.
	]==]
	function export.singularize(str)
		if type(str) == "table" then
			-- A frame object: this allows the function to be called from a template.
			str = str.args[1]
		end
		-- Look for a link at the end of the text. Both piped and unpiped
		-- links match; for an unpiped link, `display` is the empty string.
		local prefix, target, display = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
		if not target then
			return singular_form(str)
		end
		if display == "" then
			return prefix .. "[[" .. singular_form(target) .. "]]"
		end
		return prefix .. make_link(target, singular_form(display))
	end
end

--[==[
Returns the indefinite article ("a" or "an") which should precede `str`, chosen according to whether the text begins with
one of the letters a, e, i, o or u (in either case). If `str` begins with a link, the link's display text is examined.
Words like [[union]], [[uniform]] and [[university]], which take "a" despite beginning with a 'u', are not handled
correctly. The first letter of the returned article is capitalized if `ucfirst` is specified, and lowercase otherwise.
]==]
function export.get_indefinite_article(str, ucfirst)
	local text = str or ""
	-- If there is a link at the start, look at what it displays. Both piped
	-- and unpiped links match; for an unpiped link, `display` is the empty
	-- string, so the link target is used instead.
	local target, display = match(text, "^%[%[([^|%]]+)%|?(.-)%]%]")
	if target then
		text = display ~= "" and display or target
	end
	if find(text, "^[AEIOUaeiou]") then
		return ucfirst and "An" or "an"
	end
	return ucfirst and "A" or "a"
end
get_indefinite_article = export.get_indefinite_article

--[==[
Returns `text` preceded by the appropriate indefinite article and a space. Links and capitalized text are handled
correctly, but words like [[union]], [[uniform]] and [[university]], which take "a" despite beginning with a 'u', are
not. The first letter of the article is capitalized if `ucfirst` is specified, and lowercase otherwise.
]==]
function export.add_indefinite_article(text, ucfirst)
	local article = get_indefinite_article(text, ucfirst)
	return article .. " " .. text
end

return export