Jump to content

မေႃႇၵျူး:string utilities: လွင်ႈပႅၵ်ႇပိူင်ႈ ၼႂ်းၵႄႈ လွင်ႈၶူၼ်ႉၶႆႈ

လုၵ်ႉတီႈ ဝိၵ်ႇသျိၼ်ႇၼရီႇ မႃး
Content deleted Content added
No edit summary
Tag: Reverted
Sai Myo Thura Kyaw (ဢုပ်ႇဢူဝ်း) ၵေႃႉ ၶိုၼ်ၶိုၼ်း လွင်ႈၶူၼ်ႉၶႆႈ 101075
Tags: Undo Reverted
ထႅဝ် 1: ထႅဝ် 1:
local mw = mw
local string = string
local table = table
local ustring = mw.ustring

local byte = string.byte
local char = string.char
local concat = table.concat
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local len = string.len
local load_data = mw.loadData
local lower = string.lower
local match = string.match
local next = next
local reverse = string.reverse
local select = select
local sort = table.sort
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ucodepoint = ustring.codepoint
local ufind = ustring.find
local ugcodepoint = ustring.gcodepoint
local ugmatch = ustring.gmatch
local ugsub = ustring.gsub
local ulower = ustring.lower
local umatch = ustring.match
local unpack = unpack
local upper = string.upper
local usub = ustring.sub
local uupper = ustring.upper
-- Defined below.
local charset_escape
local codepoint
local explode_utf8
local format_fun
local get_indefinite_article
local pattern_escape
local pattern_simplifier
local php_trim
local replacement_escape
local u
local ulen

local module_name = "string_utilities"
local module_name = "string_utilities"

local export = {}
local export = {}


local format_escapes = {
--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
["op"] = "{",
function export.explode_utf8(str)
local text, i = {}, 0
["cl"] = "}",
}
for ch in gmatch(str, ".[\128-\191]*") do
i = i + 1
text[i] = ch
end
return text
end
explode_utf8 = export.explode_utf8


function export.format_fun(str, fun)
--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
return (str:gsub("{(\\?)((\\?)[^{}]*)}", function (p1, name, p2)
function export.pattern_escape(str)
if #p1 + #p2 == 1 then
return (gsub(str, "[$%%()*+%-.?[%]^]", "%%%0"))
return format_escapes[name] or error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
else
if fun(name) and type(fun(name)) ~= "string" then
error(module_name .. ".format: '" .. name .. "' is a " .. type(fun(name)) .. ", not a string")
end
return fun(name) or error(module_name .. ".format: '" .. name .. "' not found in table")
end
end))
end
end
pattern_escape = export.pattern_escape


--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
--[==[Escapes only the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>.]==]
====Examples====
function export.charset_escape(str)
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
return (gsub(str, "[%%%-%]^]", "%%%0"))
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
	-- Look each placeholder name up in the caller's table; format_fun does
	-- the actual substitution and error reporting.
	local function lookup(key)
		return tbl[key]
	end
	return export.format_fun(str, lookup)
end
end
charset_escape = export.charset_escape


--[==[Explodes a string into an array of UTF8 characters. '''Warning''': this function has no safety checks for non-UTF8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
function export.replacement_escape(str)
function export.explode_utf8(str)
local byte = string.byte
return (gsub(str, "%%", "%%%%"))
local sub = string.sub
end
replacement_escape = export.replacement_escape
local str_len = #str

local text = {}
do
local function check_sets_equal(set1, set2)
local n, i, b = 1, 0
local k2
while n <= str_len do
for k1, v1 in next, set1 do
b = byte(str, n)
local v2 = set2[k1]
i = i + 1
if v1 ~= v2 and (v2 == nil or not check_sets_equal(v1, v2)) then
if b < 0xC0 then
return false
text[i] = sub(str, n, n)
end
k2 = next(set2, k2)
n = n + 1
elseif b < 0xE0 then
text[i] = sub(str, n, n + 1)
n = n + 2
elseif b < 0xF0 then
text[i] = sub(str, n, n + 2)
n = n + 3
else
text[i] = sub(str, n, n + 3)
n = n + 4
end
end
return next(set2, k2) == nil
end
end
return text
local function check_sets(bytes)
end
local key, set1, set = next(bytes)

if set1 == true then
-- A helper function which takes a string, position and type ("byte" or "char"), and returns the equivalent position for the other type (e.g. iterate_utf8("字典", 2, "char") returns 4, because character 2 of "字典" begins with byte 4). `pos` can be positive or negative, and the function will iterate over the string forwards or backwards (respectively) until it reaches the input position. Checks byte-by-byte; skipping over trailing bytes, and then calculating the correct byte trail for any leading bytes (i.e. how many trailing bytes should follow); these trailing bytes are then checked together.
return true
-- The optional parameters `init_from_type` and `init_to_type` can be used to start part-way through an iteration to improve performance, if multiple values need to be returned from the same string. For example, iterate_utf8("слова́рь", 11, "byte", 5, 3) will begin checking at byte 5/the start of character 3. Note: The function won't check if these values match each other (as the only way to do this would be to run the iteration from the beginning), so mismatched values will return incorrect results.
elseif not check_sets(set1) then
local function iterate_utf8(text, pos, from_type, init_from_type, init_to_type)
return false
-- Position 0 is always valid and never changes.
end
if pos == 0 then
while true do
return pos
key, set = next(bytes, key)
if not key then
return true
elseif not check_sets_equal(set, set1) then
return false
end
end
end
end
local to_type
local function make_charset(range)
if #range == 1 then
if from_type == "char" then
to_type = "byte"
return char(range[1])
else
end
to_type = "char"
sort(range)
local compressed, n, start = {}, 0, range[1]
for i = 1, #range do
local this, nxt = range[i], range[i + 1]
if nxt ~= this + 1 then
n = n + 1
compressed[n] = this == start and char(this) or
char(start) .. "-" .. char(this)
start = nxt
end
end
return "[" .. concat(compressed) .. "]"
end
end
-- Positive positions iterate forwards; negative positions iterate backwards.
local function parse_1_byte_charset(pattern, pos)
local iterate_val
while true do
if pos > 0 then
local ch, nxt_pos
iterate_val = 1
pos, ch, nxt_pos = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos)
else
if not ch then
iterate_val = -1
return false
elseif ch == "%" then
if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", nxt_pos) then
return false
end
pos = pos + 2
elseif ch == "]" then
pos = nxt_pos
return pos
else
return false
end
end
end
end
-- Adjust init_from_type and init_to_type to the iteration before, so that matches for the position given by them will work.
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]
local trail, cp, min, b = 0
pattern_simplifier = require("Module:fun").memoize(function(pattern)
local c, leading_byte = {}
if type(pattern) == "number" then
c[from_type] = init_from_type and init_from_type ~= 0 and init_from_type - iterate_val or 0
return tostring(pattern)
c[to_type] = init_to_type and init_to_type ~= 0 and init_to_type - iterate_val or 0
while true do
if pos > 0 then
b = text:byte(c.byte + 1)
else
b = text:byte(text:len() + c.byte)
end
end
-- Position byte doesn't exist, so iterate the return value and return it.
local pos, captures, start, n, output = 1, 0, 1, 0
while true do
if not b then
return c[to_type] + iterate_val
local ch, nxt_pos
elseif b < 0x80 then
pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos)
-- 1-byte codepoint, 00-7F.
if not ch then
break
trail = 0
cp = b
min = 0
leading_byte = true
elseif b < 0xc0 then
-- A trailing byte.
leading_byte = false
elseif b < 0xc2 then
-- An overlong encoding for a 1-byte codepoint.
error("String " .. text .. " is not UTF-8.")
elseif b < 0xe0 then
-- 2-byte codepoint, C2-DF.
trail = 1
cp = b - 0xc0
min = 0x80
leading_byte = true
elseif b < 0xf0 then
-- 3-byte codepoint, E0-EF.
trail = 2
cp = b - 0xe0
min = 0x800
leading_byte = true
elseif b < 0xf4 then
-- 4-byte codepoint, F0-F3.
trail = 3
cp = b - 0xf0
min = 0x10000
leading_byte = true
elseif b == 0xf4 then
-- 4-byte codepoint, F4.
-- Make sure it doesn't decode to over U+10FFFF.
if text:byte(c.byte + 2) > 0x8f then
error("String " .. text .. " is not UTF-8.")
end
end
trail = 3
local nxt = sub(pattern, nxt_pos, nxt_pos)
if ch == "%" then
cp = 4
if nxt == "b" then
min = 0x100000
leading_byte = true
if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then
else
return false
-- Codepoint over U+10FFFF, or invalid byte.
end
error("String " .. text .. " is not UTF-8.")
pos = pos + 4
end
elseif nxt == "f" then
pos = pos + 2
-- Check subsequent bytes for multibyte codepoints.
if not match(pattern, "^()%[[^^]", pos) then
if leading_byte then
return false
local from, to
end
if pos > 0 then
-- Only possible to convert a %f charset which is all
from, to = c.byte + 2, c.byte + 1 + trail
-- ASCII, so use parse_1_byte_charset.
pos = parse_1_byte_charset(pattern, pos)
if not pos then
return false
end
elseif nxt == "Z" then
pos = pos + 2
nxt = sub(pattern, pos, pos)
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 1
else
output = output or {}
n = n + 1
if nxt == "?" then
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*"
pos = pos + 1
else
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*"
end
start = pos
end
elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then
return false
-- Skip the next character if it's ASCII. Otherwise, we will
-- still need to do length checks.
else
pos = pos + (byte(nxt) < 128 and 2 or 1)
end
elseif ch == "(" then
if nxt == ")" or captures == 32 then
return false
end
captures = captures + 1
pos = pos + 1
elseif ch == "." then
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 2
else
output = output or {}
n = n + 1
if nxt == "?" then
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*"
pos = pos + 2
else
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*"
pos = pos + 1
end
start = pos
end
elseif ch == "[" then
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
if nxt == "^" then
return false
-- If the first character is "%", ch_len is determined by the
-- next one instead.
elseif nxt == "%" then
nxt_pos = nxt_pos + 1
nxt = sub(pattern, nxt_pos, nxt_pos)
end
local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos)
if ch_len == 1 then -- Single-byte charset.
pos = parse_1_byte_charset(pattern, pos + 1)
if not pos then
return false
end
else -- Multibyte charset.
local charset_pos, bytes = pos
pos = pos + 1
while true do -- TODO: non-ASCII charset ranges.
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
if not ch then
return false
-- If escaped, get the next character. No need to
-- distinguish magic characters or character classes,
-- as they'll all fail for having the wrong length
-- anyway.
elseif ch == "%" then
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
elseif ch == "]" then
pos = nxt_pos
break
end
if ch_len ~= #ch then
return false
end
bytes = bytes or {}
local bytes = bytes
for i = 1, ch_len - 1 do
local b = byte(ch, i, i)
bytes[b] = bytes[b] or {}
bytes = bytes[b]
end
bytes[byte(ch, -1)] = true
pos = nxt_pos
end
if not pos then
return false
end
local nxt = sub(pattern, pos, pos)
if (
(nxt == "?" or nxt == "*" or nxt == "-") or
(nxt == "+" and ch_len > 2) or
not check_sets(bytes)
) then
return false
end
local ranges, b, key, next_byte = {}, 0
repeat
key, next_byte = next(bytes)
local range, n = {key}, 1
-- Loop starts on the second iteration.
for key in next, bytes, key do
n = n + 1
range[n] = key
end
b = b + 1
ranges[b] = range
bytes = next_byte
until next_byte == true
if nxt == "+" then
local range1, range2 = ranges[1], ranges[2]
ranges[1] = make_charset(range1)
ranges[3] = make_charset(range2)
local n = #range2
for i = 1, #range1 do
n = n + 1
range2[n] = range1[i]
end
ranges[2] = make_charset(range2) .. "*"
pos = pos + 1
else
for i = 1, #ranges do
ranges[i] = make_charset(ranges[i])
end
end
output = output or {}
n = n + 1
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)
start = pos
end
elseif nxt == "+" then
if #ch ~= 2 then
return false
end
output = output or {}
n = n + 1
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2)
pos = nxt_pos + 1
start = pos
elseif nxt == "?" or nxt == "*" or nxt == "-" then
return false
else
else
from, to = text:len() + c.byte + 1, text:len() + c.byte + trail
pos = nxt_pos
end
end
for trailing_byte = from, to do
end
b = text:byte(trailing_byte)
if start == 1 then
if not b or b < 0x80 or b > 0xbf then
return pattern
error("String " .. text .. " is not UTF-8.")
end
end
return concat(output) .. sub(pattern, start)
cp = cp * 0x40 + b - 0x80
end, true)
end
export.pattern_simplifier = pattern_simplifier -- For testing.
local next_byte = text:byte(to + 1)
end
if next_byte and next_byte >= 0x80 and next_byte <= 0xbf then

-- Too many trailing bytes.
function export.len(str)
error("String " .. text .. " is not UTF-8.")
return type(str) == "number" and len(str) or
elseif cp < min then
#str - #gsub(str, "[^\128-\191]+", "")
-- Overlong encoding.
end
error("String " .. text .. " is not UTF-8.")
ulen = export.len

function export.sub(str, i, j)
str, i = type(str) == "number" and tostring(str) or str, i or 1
if i < 0 or j and j < 0 then
return usub(str, i, j)
elseif j and i > j or i > #str then
return ""
end
local n, new_i = 0
for loc1, loc2 in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do
n = n + loc2 - loc1
if not new_i and n >= i then
new_i = loc2 - (n - i) - 1
if not j then
return sub(str, new_i)
end
end
end
end
if j and n > j then
c.byte = c.byte + iterate_val
return sub(str, new_i, loc2 - (n - j) - 1)
if leading_byte then
c.char = c.char + iterate_val
end
end
end
if c[from_type] == pos then
return new_i and sub(str, new_i) or ""
return c[to_type]
end

do
local function _find(str, loc1, loc2, ...)
if loc1 and not match(str, "^()[^\128-\255]*$") then
-- Use raw values of loc1 and loc2 to get loc1 and the length of the match.
loc1, loc2 = ulen(sub(str, 1, loc1)), ulen(sub(str, loc1, loc2))
-- Offset length with loc1 to get loc2.
loc2 = loc1 + loc2 - 1
end
end
return loc1, loc2, ...
end
--[==[Counterpart of find that does the work with string.find whenever possible, and falls back to mw.ustring.find otherwise.]==]
function export.find(str, pattern, init, plain)
	if not init then
		init = 1
	end
	-- A non-default start position combined with non-ASCII bytes in `str`
	-- is handed to the ustring library wholesale.
	if init ~= 1 and not match(str, "^()[^\128-\255]*$") then
		return ufind(str, pattern, init, plain)
	end
	-- Literal search: no pattern conversion is needed; _find post-processes
	-- the positions returned by string.find.
	if plain then
		return _find(str, find(str, pattern, init, true))
	end
	local simplified = pattern_simplifier(pattern)
	if not simplified then
		-- The pattern could not be converted for the string library.
		return ufind(str, pattern, init)
	end
	return _find(str, find(str, simplified, init))
end
end
end
end


--[==[Converts a character position to the equivalent byte position.]==]
--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==]
function export.match(str, pattern, init)
function export.charsToBytes(text, pos)
return iterate_utf8(text, pos, "char")
init = init or 1
if init ~= 1 and not match(str, "^()[^\128-\255]*$") then
return umatch(str, pattern, init)
end
local simple = pattern_simplifier(pattern)
if simple then
return match(str, simple, init)
end
return umatch(str, pattern, init)
end
end


--[==[Converts a byte position to the equivalent character position.]==]
--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==]
function export.gmatch(str, pattern)
function export.bytesToChars(text, pos)
local simple = pattern_simplifier(pattern)
local byte = text:byte(pos)
if byte and byte >= 0x80 and byte <= 0xbf then
if simple then
error("Byte " .. pos .. " is not a leading byte.")
return gmatch(str, simple)
end
end
return ugmatch(str, pattern)
return iterate_utf8(text, pos, "byte")
end
end


-- A helper function which iterates through a pattern, and returns two values: a potentially modified version of the pattern, and a boolean indicating whether the returned pattern is simple (i.e. whether it can be used with the stock string library); if not, then the pattern is complex (i.e. it must be used with the ustring library, which is much more resource-intensive).
--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==]
function export.gsub(str, pattern, repl, n)
local function patternSimplifier(text, pattern, plain)
local simple = pattern_simplifier(pattern)
pattern = tostring(pattern)
-- If `plain` is set, then the pattern is treated as literal (so is always simple). Only used by find.
if simple then
if plain then
return gsub(str, simple, repl, n)
return pattern, true
--If none of these are present, then the pattern has to be simple.
elseif not (
pattern:match("%[.-[\128-\255].-%]") or
pattern:match("[\128-\255][%*%+%?%-]") or
pattern:match("%%[abcdlpsuwxACDLPSUWXZ]") or
pattern:match("%[%^[^%]]+%]") or
pattern:match("%.[^%*%+%-]") or
pattern:match("%.$") or
pattern:match("%%b.?[\128-\255]") or
pattern:match("()", 1, true)
) then
return pattern, true
end
end
return ugsub(str, pattern, repl, n)
-- Otherwise, the pattern could go either way.
-- Build up the new pattern in a table, then concatenate at the end. we do it this way, as occasionally entries get modified along the way.
end
local new_pattern = {}

local len, pos, b = pattern:len(), 0
--[==[Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.]==]
local char, next_char
function export.plain_gsub(str, pattern, repl, n)
	-- Neutralise every magic character in the search text.
	local literal_pattern = pattern_escape(pattern)
	-- Only string replacements need escaping; any other kind of `repl` is
	-- passed to gsub untouched.
	local literal_repl = repl
	if type(repl) == "string" then
		literal_repl = replacement_escape(repl)
	end
	return gsub(str, literal_pattern, literal_repl, n)
end
-- `set` is a boolean that states whether the current byte is in a charset.

-- `capture` keeps track of how many layers of capture groups the position is in, while `captures` keeps a tally of how many groups have been detected (due to the string library limit of 32).
--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]
local escape, set, balanced, capture, captures = 0, false, 0, 0, 0
function export.reverse(str)
return reverse(gsub(str, "[\194-\244][\128-\191]*", reverse))
while pos < len do
end
pos = pos + 1

b = pattern:byte(pos)
do
if escape > 0 then escape = escape - 1 end
local function err(cp)
if balanced > 0 then balanced = balanced - 1 end
error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2)
char = next_char or pattern:sub(pos, pos)
end
next_char = pattern:sub(pos + 1, pos + 1)

if escape == 0 then
local function utf8_char(cp)
if char == "%" then
cp = tonumber(cp)
if cp < 0 then
-- Apply % escape.
if next_char == "." or next_char == "%" or next_char == "[" or next_char == "]" then
err("-0x" .. format("%X", -cp + 1))
escape = 2
elseif cp < 0x80 then
if balanced > 0 then balanced = balanced + 1 end
return char(cp)
-- These charsets make the pattern complex.
elseif cp < 0x800 then
elseif next_char:match("[acdlpsuwxACDLPSUWXZ]") then
return char(
return pattern, false
0xC0 + cp / 0x40,
0x80 + cp % 0x40
-- This is "%b".
elseif next_char == "b" then
)
balanced = 4
elseif cp < 0x10000 then
end
if cp >= 0xD800 and cp < 0xE000 then
-- Enter or leave a charset.
return "?" -- mw.ustring.char returns "?" for surrogates.
elseif char == "[" then
set = true
elseif char == "]" then
set = false
elseif char == "(" then
capture = capture + 1
elseif char == ")" then
if capture > 0 and set == false and balanced == 0 then
captures = captures + 1
capture = capture - 1
end
end
end
return char(
0xE0 + cp / 0x1000,
0x80 + cp / 0x40 % 0x40,
0x80 + cp % 0x40
)
elseif cp < 0x110000 then
return char(
0xF0 + cp / 0x40000,
0x80 + cp / 0x1000 % 0x40,
0x80 + cp / 0x40 % 0x40,
0x80 + cp % 0x40
)
end
end
err("0x" .. format("%X", cp))
-- Multibyte char.
end
if b > 0x7f then

-- If followed by "*", "+" or "-", then 2-byte chars can be converted into charsets. However, this is not possible with 3 or 4-byte chars, as the charset would be too permissive, because if the trailing bytes were in a different order then this could be a different valid character.
function export.char(cp, ...)
if ... == nil then
if next_char == "*" or next_char == "+" or next_char == "-" then
local prev_pos = pattern:byte(pos - 1)
return utf8_char(cp)
if prev_pos > 0xc1 and prev_pos < 0xe0 then
new_pattern[#new_pattern] = "[" .. new_pattern[#new_pattern]
table.insert(new_pattern, char .. "]")
else
return pattern, false
end
-- If in a charset or used in "%b", then the pattern is complex.
-- If followed by "?", add "?" after each byte.
elseif next_char == "?" then
table.insert(new_pattern, char .. "?")
local check_pos, check_b, i = pos, pattern:byte(pos), #new_pattern
while check_b and check_b < 0xc0 do
check_pos = check_pos - 1
check_b = pattern:byte(check_pos)
i = i - 1
new_pattern[i] = new_pattern[i] .. "?"
end
pos = pos + 1
next_char = pattern:sub(pos + 1, pos + 1)
elseif set or balanced > 0 then
return pattern, false
else
table.insert(new_pattern, char)
end
elseif char == "." then
-- "*", "+", "-" are always okay after ".", as they don't care how many bytes a char has.
if set or next_char == "*" or next_char == "+" or next_char == "-" or escape > 0 then
table.insert(new_pattern, char)
-- If followed by "?", make sure "?" is after the leading byte of the UTF-8 char pattern, then skip forward one.
elseif next_char == "?" then
table.insert(new_pattern, "[%z\1-\127\194-\244]?[\128-\191]*")
pos = pos + 1
next_char = pattern:sub(pos + 1, pos + 1)
-- If used with "%b", pattern is complex.
elseif balanced > 0 then
return pattern, false
-- Otherwise, add the UTF-8 char pattern.
else
table.insert(new_pattern, "[%z\1-\127\194-\244][\128-\191]*")
end
-- Negative charsets are always complex, unless the text has no UTF-8 chars.
elseif char == "[" and next_char == "^" and escape == 0 and text:match("[\128-\255]") then
return pattern, false
-- "()" matches the position unless escaped or used with "%b", so always necessitates ustring (as we need it to match the char position, not the byte one).
elseif char == "(" and next_char == ")" and balanced == 0 and escape == 0 and text:match("[\128-\255]") then
return pattern, false
else
table.insert(new_pattern, char)
end
end
local ret = {cp, ...}
for i = 1, select("#", cp, ...) do
ret[i] = utf8_char(ret[i])
end
return concat(ret)
end
end
if captures > 32 then
u = export.char
return pattern, false
else
pattern = table.concat(new_pattern)
return pattern, true
end
end
end


--[==[A version of len which uses string.len, but returns the same result as mw.ustring.len.]==]
do
local function get_codepoint(b1, b2, b3, b4)
function export.len(text)
text = tostring(text)
if b1 < 128 then
local len_bytes = text:len()
return b1, 1
if not text:match("[\128-\255]") then
elseif b1 < 224 then
return 0x40 * b1 + b2 - 0x3080, 2
return len_bytes
else
elseif b1 < 240 then
return 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080, 3
return iterate_utf8(text, len_bytes, "byte")
end
return 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080, 4
end
end
end


--[==[A version of sub which uses string.sub, but returns the same result as mw.ustring.sub.]==]
function export.codepoint(str, i, j)
function export.sub(text, i_char, j_char)
if type(str) == "number" then
text = tostring(text)
return byte(str, i, j)
if not text:match("[\128-\255]") then
end
return text:sub(i_char, j_char)
i, j = i or 1, j == -1 and #str or i or 1
if i == 1 and j == 1 then
return (get_codepoint(byte(str, 1, 4)))
elseif i < 0 or j < 0 then
return ucodepoint(str, i, j) -- FIXME
end
local n, nb, ret, nr = 0, 1, {}, 0
while n < j do
n = n + 1
if n < i then
local b = byte(str, nb)
nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
else
local b1, b2, b3, b4 = byte(str, nb, nb + 3)
if not b1 then
break
end
nr = nr + 1
local add
ret[nr], add = get_codepoint(b1, b2, b3, b4)
nb = nb + add
end
end
return unpack(ret)
end
end
local i_byte, j_byte
codepoint = export.codepoint
if j_char then
if i_char > 0 and j_char > 0 then
function export.gcodepoint(str, i, j)
if j_char < i_char then return "" end
i, j = i or 1, j ~= -1 and j or nil
i_byte = iterate_utf8(text, i_char, "char")
if i < 0 or j and j < 0 then
j_byte = iterate_utf8(text, j_char + 1, "char", i_char, i_byte) - 1
return ugcodepoint(str, i, j) -- FIXME
elseif i_char < 0 and j_char < 0 then
end
if j_char < i_char then return "" end
local n, nb = 1, 1
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
while n < i do
i_byte = iterate_utf8(text, i_char, "char", j_char, j_byte)
local b = byte(str, nb)
-- For some reason, mw.ustring.sub with i=0, j=0 returns the same result as for i=1, j=1, while string.sub always returns "". However, mw.ustring.sub does return "" with i=1, j=0. As such, we need to adjust j_char to 1 if i_char is either 0, or negative with a magnitude greater than the length of the string.
if not b then
elseif j_char == 0 then
break
i_byte = iterate_utf8(text, i_char, "char")
end
if i_byte == 0 or -i_byte > text:len() then j_char = 1 end
nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
n = n + 1
end
else
i_byte = iterate_utf8(text, i_char, "char")
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
return function()
if j and n > j then
return nil
end
n = n + 1
local b1, b2, b3, b4 = byte(str, nb, nb + 3)
if not b1 then
return nil
end
local ret, add = get_codepoint(b1, b2, b3, b4)
nb = nb + add
return ret
end
end
else
i_byte = iterate_utf8(text, i_char, "char")
end
end
return text:sub(i_byte, j_byte)
end
end


--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
function export.lower(str)
function export.lower(text)
text = tostring(text)
return (match(str, "^()[^\128-\255]*$") and lower or ulower)(str)
if not text:match("[\128-\255]") then
return text:lower()
else
return mw.ustring.lower(text)
end
end
end


--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
function export.upper(str)
function export.upper(text)
text = tostring(text)
return (match(str, "^()[^\128-\255]*$") and upper or uupper)(str)
if not text:match("[\128-\255]") then
return text:upper()
else
return mw.ustring.upper(text)
end
end
end


--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==]
do
local function add_captures(text, n, ...)
function export.find(text, pattern, init_char, plain)
text = tostring(text)
-- Insert any captures from the splitting pattern.
local simple
local offset, capture = n - 1, ...
pattern, simple = patternSimplifier(text, pattern, plain)
while capture do
-- If the pattern is simple but multibyte characters are present, then init_char needs to be converted into bytes for string.find to work properly, and the return values need to be converted back into chars.
n = n + 1
if simple then
text[n] = capture
if not text:match("[\128-\255]") then
capture = select(n - offset, ...)
return text:find(pattern, init_char, plain)
end
return n
end
local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...)
if not (loc1 and start <= str_len) then
-- If no match, or there is but we're past the end of the string
-- (which happens when the match is the empty string), then add
-- the final chunk and return.
n = n + 1
text[n] = _sub(str, start)
return
elseif loc2 < loc1 then
-- Special case: If we match the empty string, then include the
-- next character; this avoids an infinite loop, and makes
-- splitting by an empty string work the way mw.text.split() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, return immediately, so we
-- don't get a final empty string. If using the string library, we
-- need to make sure we advance by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
n = n + 1
text[n] = _sub(str, start, loc1)
start = loc1 + 1
if start > str_len then
return ... and add_captures(text, n, ...) or n
end
else
else
local init_byte = init_char and iterate_utf8(text, init_char, "char")
-- Add chunk up to the current match.
local byte1, byte2, c1, c2, c3, c4, c5, c6, c7, c8, c9 = text:find(pattern, init_byte, plain)
n = n + 1
text[n] = _sub(str, start, loc1 - 1)
-- If string.find returned nil, then return nil.
start = loc2 + 1
if not (byte1 and byte2) then
end
return nil
return (... and add_captures(text, n, ...) or n), start
end
-- Drives the split: feeds each successive match found by `_find` to
-- `iterate`, which appends to the result table, and stops once `iterate`
-- no longer returns a position to resume from. Returns the result table.
local function _split(str, pattern, str_len, _sub, _find, plain)
	local pieces, count, cursor = {}, 0, 1
	while cursor do
		count, cursor = iterate(str, str_len, pieces, count, cursor, _sub, _find(str, pattern, cursor, plain))
	end
	return pieces
end
--[==[Reimplementation of mw.text.split() which also returns any capturing groups of the splitting pattern. The behaviour follows Python's re.split(), except that an empty split pattern behaves the Lua way (advancing one character at a time, where Python would return the whole remainder of the string). The string library is used when possible, and the ustring library otherwise. Two optional parameters are accepted: `str_lib` forces use of the string library, and `plain` switches off all pattern matching facilities so that `pattern` is treated as literal.]==]
function export.split(str, pattern, str_lib, plain)
	if not (str_lib or plain) then
		local simplified = pattern_simplifier(pattern)
		if not simplified then
			-- No string-library equivalent: measure and slice with ustring.
			return _split(str, pattern, ulen(str), usub, ufind)
		end
		return _split(str, simplified, #str, sub, find)
	end
	-- Forced string-library mode or literal mode.
	return _split(str, pattern, #str, sub, find, plain)
end
export.capturing_split = export.split -- To be removed.
end

do
-- TODO: merge this with export.split. Not clear how to do this while
-- maintaining the same level of performance, as gsplit is slower.
local function _split(str, pattern, str_len, _sub, _find, plain)
local start, final = 1
local function iter(loc1, loc2, ...)
-- If no match, return the final chunk.
if not loc1 then
final = true
return _sub(str, start)
end
end
-- Special case: If we match the empty string, then eat the
-- Get first return value. If we have a positive init_char, we can save resources by resuming at that point.
-- next character; this avoids an infinite loop, and makes
local char1, char2
-- splitting by the empty string work the way mw.text.gsplit() does
if (not init_char) or init_char > 0 then
-- (including non-adjacent empty string matches with %f). If we
char1 = iterate_utf8(text, byte1, "byte", init_byte, init_char)
-- reach the end of the string this way, set `final` to true, so we
-- don't get stuck matching the empty string at the end.
local chunk
if loc2 < loc1 then
-- If using the string library, we need to make sure we advance
-- by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
chunk = _sub(str, start, loc1)
if loc1 >= str_len then
final = true
else
start = loc1 + 1
end
-- Eat chunk up to the current match.
else
else
chunk = _sub(str, start, loc1 - 1)
char1 = iterate_utf8(text, byte1, "byte")
start = loc2 + 1
end
end
return chunk, ...
-- If byte1 and byte2 are the same, don't bother running iterate_utf8 twice. Otherwise, resume iterate_utf8 from byte1 to find char2.
end
if byte1 == byte2 then
char2 = char1
return function()
else
if not final then
return iter(_find(str, pattern, start, plain))
char2 = iterate_utf8(text, byte2, "byte", byte1, char1)
end
end
return nil
return unpack{char1, char2, c1, c2, c3, c4, c5, c6, c7, c8, c9}
end
end
else
end
return mw.ustring.find(text, pattern, init_char, plain)
--[==[Counterpart of `export.split` built on this block's local `_split` helper, whose result is returned as-is. The library is chosen the same way: `str_lib` or `plain` force the string library (with `plain` also making `pattern` literal); otherwise the string library is used if the pattern can be simplified for it, and the ustring library if not.]==]
function export.gsplit(str, pattern, str_lib, plain)
	if not (str_lib or plain) then
		-- See whether the pattern can be rewritten so that the byte-based
		-- string library handles it.
		local simplified = pattern_simplifier(pattern)
		if not simplified then
			-- No byte-safe equivalent: use the ustring functions, with the
			-- length measured by ulen to match.
			return _split(str, pattern, ulen(str), usub, ufind)
		end
		return _split(str, simplified, #str, sub, find)
	end
	-- Caller asked for the string library (or a literal search).
	return _split(str, pattern, #str, sub, find, plain)
end
end
end
end


--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==]
function export.trim(str, charset)
function export.match(text, pattern, init)
if not charset then
text = tostring(text)
return match(str, "^()%s*$") and "" or match(str, "^%s*(.*%S)")
local simple
elseif match(charset, "^()[^\128-\255]*$") then
pattern, simple = patternSimplifier(text, pattern)
return match(str, "^()[" .. charset .. "]*$") and "" or match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])")
if simple then
if init and text:find("[\128-\255]") then
init = iterate_utf8(text, init, "char")
end
return text:match(pattern, init)
else
return mw.ustring.match(text, pattern, init)
end
end
return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$")
end
end


--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==]
do
function export.gmatch(text, pattern)
local entities
text = tostring(text)

local simple
local function decode_numeric_entity(code, pattern, base)
pattern, simple = patternSimplifier(text, pattern)
local cp = match(code, pattern) and tonumber(code, base)
if simple then
return cp and cp < 0x110000 and u(cp) or nil
return text:gmatch(pattern)
end
else

return mw.ustring.gmatch(text, pattern)
-- Replacement callback for the entity pattern in export.decode_entities.
-- `hash` is "#" for a numeric character reference and "" for a named entity,
-- `x` is the optional "x"/"X" marker, and `code` is the rest of the entity
-- body. Returns the decoded text, or nil when the entity is not recognised,
-- in which case gsub leaves the original text in place.
local function decode_entity(hash, x, code)
	if hash == "#" then
		-- Without the "x" marker the reference is decimal only. It must not
		-- fall back to the hexadecimal branch when the decimal check fails,
		-- otherwise malformed input such as "&#ff;" would be decoded as if
		-- it had been written "&#xff;".
		if x == "" then
			return decode_numeric_entity(code, "^%d+$")
		end
		return decode_numeric_entity(code, "^%x+$", 16)
	end
	-- Named entity: the data table is loaded lazily on first use. The "x"
	-- capture is part of the name here (e.g. "xi"), so it is glued back on.
	entities = entities or load_data("Module:data/entities")
	return entities[x .. code]
end

-- Decodes HTML entities in `str`, delegating each candidate to decode_entity.
-- Proper HTML named entities never contain non-ASCII characters, but MediaWiki
-- defines some custom aliases that do, and these are also listed in
-- [[Module:data/entities]]; hence the \128-\255 range in the pattern.
function export.decode_entities(str)
	-- Cheap plain-text check first: no ampersand means nothing to decode.
	if not find(str, "&", 1, true) then
		return str
	end
	-- Parentheses drop gsub's replacement count so only the string is returned.
	return (gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity))
end
end
end
end


--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==]
do
function export.gsub(text, pattern, repl, n)
local html_entities
text = tostring(text)
local function encode_entity(ch)
local simple
pattern, simple = patternSimplifier(text, pattern)
local entity = html_entities[ch]
if entity then
if simple then
return entity
return text:gsub(pattern, repl, n)
else
end
return mw.ustring.gsub(text, pattern, repl, n)
entity = "&#" .. codepoint(ch) .. ";"
html_entities[ch] = entity
return entity
end
--[==[Replaces characters in `str` with HTML entities. With no `charset`, the characters {"}, {&}, {'}, {<}, {>} and the no-break space are replaced. Otherwise `charset` is the inside of a character set (without the surrounding brackets) and every matching character is passed to the `encode_entity` helper. `plain` treats `charset` as a literal list of characters; `str_lib` forces the string library, which raises an error if `charset` contains anything above U+007F.]==]
function export.encode_entities(str, charset, str_lib, plain)
	-- Memoized HTML entities (taken from mw.text.lua); created on first call.
	if not html_entities then
		html_entities = {
			["\""] = "&quot;",
			["&"] = "&amp;",
			["'"] = "&#039;",
			["<"] = "&lt;",
			[">"] = "&gt;",
			["\194\160"] = "&nbsp;",
		}
	end
	-- Default set: the table itself is the replacement, so a lone \194 byte
	-- that is not part of a no-break space has no entry and is left alone.
	if not charset then
		return (gsub(str, "[\"&'<>\194]\160?", html_entities))
	end
	-- Literal character list: escape it before building the set.
	if plain then
		local literal_set = "[" .. charset_escape(charset) .. "]"
		return (gsub(str, literal_set, encode_entity))
	end
	-- Forced string library: only safe when the set is pure ASCII.
	if str_lib then
		if not match(charset, "^()[^\128-\255]*$") then
			error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.")
		end
		return (gsub(str, "[" .. charset .. "]", encode_entity))
	end
	-- General case: prefer the string library when the pattern can be
	-- simplified for it, otherwise fall back to ustring.
	local set_pattern = charset and "[" .. charset .. "]"
	local byte_pattern = pattern_simplifier(set_pattern)
	if byte_pattern then
		return (gsub(str, byte_pattern, encode_entity))
	end
	return (ugsub(str, set_pattern, encode_entity))
end
end
end
end


--[==[
do
-- Reimplementation of mw.ustring.split() that includes any capturing
local function decode_path(code)
-- groups in the splitting pattern. This works like Python's re.split()
return char(tonumber(code, 16))
-- function, except that it has Lua's behavior when the split pattern
end
-- is empty (i.e. advancing by one character at a time; Python returns the
-- whole remainder of the string).
local function decode(lead, trail)
]==]
if lead == "+" or lead == "_" then
function export.capturing_split(str, pattern)
return " " .. trail
local ret = {}
elseif #trail == 2 then
-- (.-) corresponds to (.*?) in Python or Perl; () captures the
return decode_path(trail)
-- current position after matching.
pattern = "(.-)" .. pattern .. "()"
local start = 1
while true do
-- Did we reach the end of the string?
if start > #str then
table.insert(ret, "")
return ret
end
end
-- match() returns all captures as multiple return values;
return lead .. trail
-- we need to insert into a table to get them all.
end
local captures = {export.match(str, pattern, start)}
-- If no match, add the remainder of the string.
function export.decode_uri(str, enctype)
if #captures == 0 then
enctype = enctype and upper(enctype) or "QUERY"
table.insert(ret, export.sub(str, start))
if enctype == "PATH" then
return find(str, "%", 1, true) and
return ret
gsub(str, "%%(%x%x)", decode_path) or str
elseif enctype == "QUERY" then
return (find(str, "%", 1, true) or find(str, "+", 1, true)) and
gsub(str, "([%%%+])(%x?%x?)", decode) or str
elseif enctype == "WIKI" then
return (find(str, "%", 1, true) or find(str, "_", 1, true)) and
gsub(str, "([%%_])(%x?%x?)", decode) or str
end
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
end
end

do
local function _remove_comments(str, pre)
local head = find(str, "<!--", 1, true)
if not head then
return str
end
end
local ret, n = {sub(str, 1, head - 1)}, 1
local newstart = table.remove(captures)
-- Special case: If we don't advance by any characters, then advance
while true do
-- by one character; this avoids an infinite loop, and makes splitting
local loc = find(str, "-->", head + 4, true)
-- by an empty string work the way mw.ustring.split() does. If we
if not loc then
-- reach the end of the string this way, return immediately, so we
return pre and concat(ret) or
-- don't get a final empty string.
concat(ret) .. sub(str, head)
if newstart == start then
table.insert(ret, export.sub(str, start, start))
table.remove(captures, 1)
start = start + 1
if start > #str then
return ret
end
end
else
head = loc + 3
table.insert(ret, table.remove(captures, 1))
loc = find(str, "<!--", head, true)
if not loc then
start = newstart
return concat(ret) .. sub(str, head)
end
n = n + 1
ret[n] = sub(str, head, loc - 1)
head = loc
end
end
-- Insert any captures from the splitting pattern.
end
for _, x in ipairs(captures) do
table.insert(ret, x)
--[==[Removes any HTML comments from the input text. `stage` can be one of three options:
* {{lua|"PRE"}} (default) applies the method used by MediaWiki's preprocessor: all {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
* {{lua|"POST"}} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code||<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code||<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will have already been applied by the native parser.
* {{lua|"BOTH"}} applies {{lua|"PRE"}} then {{lua|"POST"}}.]==]
function export.remove_comments(str, stage)
if not stage or stage == "PRE" then
return _remove_comments(str, true)
end
local processed = stage == "POST" and _remove_comments(str) or
stage == "BOTH" and _remove_comments(str, true) or
error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2)
while processed ~= str do
str = processed
processed = _remove_comments(str)
end
end
return str
end
end
end
end


local function uclcfirst(text, dolower)
--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
function export.php_trim(str)
local function douclcfirst(text)
-- Actual function to re-case of the first letter.
return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""
local first_letter = export.sub(text, 1, 1)
end
first_letter = dolower and export.lower(first_letter) or export.upper(first_letter)
php_trim = export.php_trim
return first_letter .. export.sub(text, 2)

--[==[Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} is normalized to {{code|lua|1}} (a number), and {{code|lua|" foo "}} is normalized to {{code|lua|"foo"}}. If the input is not a string, it is returned unchanged.

After being trimmed with {{code|lua|export.php_trim}}, strings are converted to numbers if:
# They are integers, with no decimals (2.0) or leading zeroes (02).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# For positive values, they do not have a leading {{code|lua|+}} sign.]==]
function export.scribunto_param_key(key)
if type(key) ~= "string" then
return key
end
end
-- If there's a link at the beginning, re-case the first letter of the
key = php_trim(key)
-- link text. This pattern matches both piped and unpiped links.
if match(key, "^-?[1-9]%d*$") then
-- If the link is not piped, the second capture (linktext) will be empty.
local num = tonumber(key)
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true.
if link then
return (
return "[[" .. link .. "|" .. douclcfirst(linktext ~= "" and linktext or link) .. "]]" .. remainder
num <= 9007199254740991 and num >= -9007199254740991 or
key == "9007199254740992" or
key == "-9007199254740992"
) and num or key
elseif key == "0" then
return 0
end
end
return key
return douclcfirst(text)
end
end


function export.ucfirst(text)
do
return uclcfirst(text, false)
-- Lookup table of special-cased byte escapes; loaded on first use.
local byte_escapes
-- Returns the escape for a single byte: the table entry if there is one,
-- otherwise a backslash followed by the byte value as three decimal digits.
local function escape_byte(b)
	local known = byte_escapes[b]
	if known then
		return known
	end
	return format("\\%03d", byte(b))
end
--[==[Replaces every byte of `str` with its escape sequence: either the entry from the `byte_escapes` table in [[Module:string utilities/data]], or a backslash plus the three-digit decimal byte value.]==]
function export.escape_bytes(str)
	if not byte_escapes then
		byte_escapes = load_data("Module:string utilities/data").byte_escapes
	end
	-- Parentheses drop gsub's replacement count.
	return (gsub(str, ".", escape_byte))
end
end
end


function export.format_fun(str, fun)
function export.lcfirst(text)
return uclcfirst(text, true)
return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2)
if #p1 + #p2 == 1 then
return name == "op" and "{" or
name == "cl" and "}" or
error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
elseif fun(name) and type(fun(name)) ~= "string" then
error(module_name .. ".format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
end
return fun(name) or error(module_name .. ".format: \"" .. name .. "\" not found in table")
end))
end
format_fun = export.format_fun

--[==[Fills in a format string from a table. Unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, this function takes exactly two parameters, a format string and a table, and replaces every {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. Literal opening and closing braces are written as <code>{\op}</code> and <code>{\cl}</code>, respectively. To refer to a table key that itself begins with a backslash, double that initial backslash.
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
	-- format_fun expects a lookup function, so wrap the table access in one.
	local function lookup(key)
		return tbl[key]
	end
	return format_fun(str, lookup)
end
end


-- Faster version of mw.text.nowiki, with minor changes to match the PHP equivalent: ";" always escapes, and colons in unslashed protocols only escape after regex \b.
do
do
local function do_uclcfirst(str, case_func)
local function escape_char(str1, str2)
if str2 then
-- Actual function to re-case of the first letter.
return str1 .. "&#" .. str2:byte() .. ";"
local first_letter = case_func(match(str, "^.[\128-\191]*") or "")
end
return first_letter .. sub(str, #first_letter + 1)
return "&#" .. str1:byte() .. ";"
end
end
local function uclcfirst(str, case_func)
local function escape_uri(uri)
local uri_schemes = mw.loadData("Module:string utilities/data").uri_schemes
-- If there's a link at the beginning, re-case the first letter of the
return uri_schemes[uri:lower()] and uri .. "&#58;" or uri .. ":"
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
if link then
return "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
end
return do_uclcfirst(str, case_func)
end
end
function export.ucfirst(str)
function export.nowiki(text)
return uclcfirst(str, uupper)
return (text
:gsub("[\"&';<=>%[%]{|}]", escape_char)
:gsub("^[\t\n\r #%*:]", escape_char)
:gsub("([\n\r])([\t\n\r #%*:])", escape_char)
:gsub("%f[^%z\r\n]%-(%-%-%-)", "&#45;%1")
:gsub("__", "_&#95;")
:gsub("://", "&#58;//")
:gsub("(ISBN)(%s)", escape_char)
:gsub("(PMID)(%s)", escape_char)
:gsub("(RFC)(%s)", escape_char)
:gsub("([%w_]+):", escape_uri))
end
end
end


function export.lcfirst(str)
function export.capitalize(text)
if type(text) == "table" then
return uclcfirst(str, ulower)
-- allow calling from a template
text = text.args[1]
end
end
-- Capitalize multi-word that is separated by spaces
-- by uppercasing the first letter of each part.
local function capitalize(w)
-- I assume nobody will input all CAP text.
return uclcfirst(w, uupper)
w2 = {}
end
for w in export.gmatch(text, "%S+") do
table.insert(w2, uclcfirst(w, false))
--[==[Uppercases the first letter of every whitespace-separated word in `str`. WARNING: May be broken in the presence of multiword links.]==]
function export.capitalize(str)
	local input = str
	-- A table argument means we were invoked from a template: take the
	-- first template argument as the text.
	if type(input) == "table" then
		input = input.args[1]
	end
	-- Each run of non-space characters is handed to the `capitalize` helper,
	-- which re-cases its first letter. The rest of each word is not
	-- lowercased, on the assumption that nobody passes all-caps text.
	local result = ugsub(input, "%S+", capitalize)
	return result
end
end
return table.concat(w2, " ")
end
end


function export.pluralize(text)
do
if type(text) == "table" then
local function word_ends_in_consonant_plus_y(str)
-- allow calling from a template
text = text.args[1]
end
-- Pluralize a word in a smart fashion, according to normal English rules.
-- 1. If word ends in consonant + -y, replace the -y with -ies.
-- 2. If the word ends in -s, -x, -z, -sh, -ch, add -es.
-- 3. Otherwise, add -s.
-- This handles links correctly:
-- 1. If a piped link, change the second part appropriately.
-- 2. If a non-piped link and rule #1 above applies, convert to a piped link
-- with the second part containing the plural.
-- 3. If a non-piped link and rules #2 or #3 above apply, add the plural
-- outside the link.
local function word_ends_in_consonant_plus_y(text)
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
-- apply to proper nouns, hence "the Gettys", "the public Ivys".
-- apply to proper nouns, hence "the Gettys", "the public Ivys".
ထႅဝ် 964: ထႅဝ် 605:
-- be important as this function is almost always called on common nouns
-- be important as this function is almost always called on common nouns
-- (e.g. parts of speech, place types).
-- (e.g. parts of speech, place types).
return find(str, "[^aeiouAEIOU ]y$")
return text:find("[^aeiouAEIOU ]y$")
end
end
local function word_takes_es_plural(str)
local function word_takes_es_plural(text)
return find(str, "[sxz]$") or find(str, "[cs]h$")
return text:find("[sxz]$") or text:find("[cs]h$")
end
end
local function do_pluralize(str)
local function do_pluralize(text)
if word_ends_in_consonant_plus_y(str) then
if word_ends_in_consonant_plus_y(text) then
-- avoid returning multiple values
-- avoid returning multiple values
return (gsub(str, "y$", "ies"))
local hack_single_retval = text:gsub("y$", "ies")
return hack_single_retval
elseif word_takes_es_plural(str) then
elseif word_takes_es_plural(text) then
return str .. "es"
return text .. "es"
else
return text .. "s"
end
end
end
return str .. "s"
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
--[==[
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
Pluralize a word in a smart fashion, according to normal English rules.
if link then
# If word ends in consonant + -y, replace the -y with -ies.
if linktext ~= "" then
# If the word ends in -s, -x, -z, -sh, -ch, add -es.
# Otherwise, add -s.

This handles links correctly:
# If a piped link, change the second part appropriately.
# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.
]==]
function export.pluralize(str)
if type(str) == "table" then
-- allow calling from a template
str = str.args[1]
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
if not link then
return do_pluralize(str)
elseif linktext ~= "" then
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
end
elseif word_ends_in_consonant_plus_y(link) then
if word_ends_in_consonant_plus_y(link) then
return beginning .. "[[" .. link .. "|" .. gsub(link, "y$", "ies") .. "]]"
return beginning .. "[[" .. link .. "|" .. link:gsub("y$", "ies") .. "]]"
end
end
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
end
end
return do_pluralize(text)
end
end


function export.singularize(text)
do
if type(text) == "table" then
local function do_singularize(str)
-- allow calling from a template
local sing = match(str, "^(.-)ies$")
text = text.args[1]
end
-- Singularize a word in a smart fashion, according to normal English rules.
-- Works analogously to pluralize().
-- NOTE: This doesn't always work as well as pluralize(). Beware. It will
-- mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
-- 1. If word ends in -ies, replace -ies with -y.
-- 2. If the word ends in -xes, -shes, -ches, remove -es. [Does not affect
-- -ses, cf. "houses", "impasses".]
-- 3. Otherwise, remove -s.
-- This handles links correctly:
-- 1. If a piped link, change the second part appropriately. Collapse the
-- link to a simple link if both parts end up the same.
-- 2. If a non-piped link, singularize the link.
-- 3. A link like "[[parish]]es" will be handled correctly because the
-- code that checks for -shes etc. allows ] characters between the
-- 'sh' etc. and final -es.
local function do_singularize(text)
local sing = text:match("^(.-)ies$")
if sing then
if sing then
return sing .. "y"
return sing .. "y"
end
end
-- Handle cases like "[[parish]]es"
-- Handle cases like "[[parish]]es"
return match(str, "^(.-[sc]h%]*)es$") or
local sing = text:match("^(.-[sc]h%]*)es$")
if sing then
return sing
end
-- Handle cases like "[[box]]es"
-- Handle cases like "[[box]]es"
match(str, "^(.-x%]*)es$") or
local sing = text:match("^(.-x%]*)es$")
if sing then
-- Handle regular plurals
return sing
match(str, "^(.-)s$") or
end
-- Otherwise, return input
local sing = text:match("^(.-)s$")
str
if sing then
return sing
end
return text
end
end

local function collapse_link(link, linktext)
local function collapse_link(link, linktext)
if link == linktext then
if link == linktext then
return "[[" .. link .. "]]"
return "[[" .. link .. "]]"
else
return "[[" .. link .. "|" .. linktext .. "]]"
end
end
return "[[" .. link .. "|" .. linktext .. "]]"
end
end
--[==[
Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.


-- Check for a link. This pattern matches both piped and unpiped links.
'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
-- If the link is not piped, the second capture (linktext) will be empty.
# If word ends in -ies, replace -ies with -y.
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
# If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
if link then
# Otherwise, remove -s.
if linktext ~= "" then

This handles links correctly:
# If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same.
# If a non-piped link, singularize the link.
# A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the
'sh' etc. and final -es.
]==]
function export.singularize(str)
if type(str) == "table" then
-- allow calling from a template
str = str.args[1]
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
if not link then
return do_singularize(str)
elseif linktext ~= "" then
return beginning .. collapse_link(link, do_singularize(linktext))
return beginning .. collapse_link(link, do_singularize(linktext))
end
end
return beginning .. "[[" .. do_singularize(link) .. "]]"
return beginning .. "[[" .. do_singularize(link) .. "]]"
end
end

return do_singularize(text)
end
end



--[==[
function export.add_indefinite_article(text, uppercase)
Return the appropriate indefinite article to prefix to `str`. Correctly handles links and capitalized text.
Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning with
a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.get_indefinite_article(str, ucfirst)
str = str or ""
local is_vowel = false
local is_vowel = false
-- If there's a link at the beginning, examine the first letter of the
-- If there's a link at the beginning, examine the first letter of the
-- link text. This pattern matches both piped and unpiped links.
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
if link then
if link then
is_vowel = find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
is_vowel = export.find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
else
else
is_vowel = find(str, "^[AEIOUaeiou]")
is_vowel = export.find(text, "^[AEIOUaeiou]")
end
end
return is_vowel and (ucfirst and "An" or "an") or (ucfirst and "A" or "a")
return (is_vowel and (uppercase and "An " or "an ") or (uppercase and "A " or "a ")) .. text
end
end
get_indefinite_article = export.get_indefinite_article


-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
--[==[
function export.escape_risky_characters(text)
Prefix `text` with the appropriate indefinite article to prefix to `text`. Correctly handles links and capitalized
-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software.
text. Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning
if not mw.ustring.match(text, "%S") then
with a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
return mw.text.encode(text, "%s")
]==]
else
function export.add_indefinite_article(text, ucfirst)
return mw.text.encode(text, "!#%%&*+/:;<=>?@[\\%]_{|}")
return get_indefinite_article(text, ucfirst) .. " " .. text
end
end
end



ၶိုၼ်းၶူၼ်ႉၶႆႈၼင်ႇ 16:33, 26 မေႇ 2024

Provides some utility functions for manipulating strings.

Functions

export.format_fun

function export.format_fun(str, fun)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.format

function export.format(str, tbl)

This function, unlike string.format and mw.ustring.format, takes just two parameters—a format string and a table—and replaces all instances of {param_name} in the format string with the table's entry for param_name. The opening and closing brace characters can be escaped with {\op} and {\cl}, respectively. A table entry beginning with a backslash can be escaped by doubling the initial backslash.

Examples

  • string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})
    produces: "one fish, two fish, red fish, blue fish"
  • string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})
    produces: "The set {1, 2, 3} contains three elements."
    • Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.

export.explode_utf8

function export.explode_utf8(str)

Explodes a string into an array of UTF8 characters. Warning: this function has no safety checks for non-UTF8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.

export.charsToBytes

function export.charsToBytes(text, pos)

Converts a character position to the equivalent byte position.

export.bytesToChars

function export.bytesToChars(text, pos)

Converts a byte position to the equivalent character position.

export.len

function export.len(text)

A version of len which uses string.len, but returns the same result as mw.ustring.len.

export.sub

function export.sub(text, i_char, j_char)

A version of sub which uses string.sub, but returns the same result as mw.ustring.sub.

export.lower

function export.lower(text)

A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.

export.upper

function export.upper(text)

A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.

export.find

function export.find(text, pattern, init_char, plain)

A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.

export.match

function export.match(text, pattern, init)

A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.

export.gmatch

function export.gmatch(text, pattern)

A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.

export.gsub

function export.gsub(text, pattern, repl, n)

A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.

export.capturing_split

function export.capturing_split(str, pattern)

Reimplementation of mw.ustring.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string).

export.ucfirst

function export.ucfirst(text)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.lcfirst

function export.lcfirst(text)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.nowiki

function export.nowiki(text)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.capitalize

function export.capitalize(text)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.pluralize

function export.pluralize(text)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.singularize

function export.singularize(text)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.add_indefinite_article

function export.add_indefinite_article(text, uppercase)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.

export.escape_risky_characters

function export.escape_risky_characters(text)

This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.


-- Name of this module; used as the prefix of the error messages raised by format_fun.
local module_name = "string_utilities"
-- Table of public functions; it is returned at the end of the module.
local export = {}

-- Escape names recognised by format_fun: {\op} yields "{" and {\cl} yields "}".
local format_escapes = {
	["op"] = "{",
	["cl"] = "}",
}

--[==[Replaces every brace-delimited placeholder in `str` with the string returned by calling `fun` on the placeholder's name. <code>{\op}</code> and <code>{\cl}</code> produce literal opening and closing braces. Throws an error if an escape sequence is not recognised, if `fun` returns a value that is neither a string nor false/nil-like for a name, or if `fun` returns nothing for a name.]==]
function export.format_fun(str, fun)
	-- The outer parentheses discard gsub's second return value (the substitution count).
	return (str:gsub("{(\\?)((\\?)[^{}]*)}", function (p1, name, p2)
		-- Exactly one leading backslash marks an escape sequence such as {\op}; two backslashes mean the
		-- name itself begins with a backslash, so it is looked up like any other name.
		if #p1 + #p2 == 1 then
			return format_escapes[name] or error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
		end
		-- Call `fun` once only and reuse the result, so that an expensive or impure callback is not
		-- invoked several times for the same placeholder.
		local value = fun(name)
		if value and type(value) ~= "string" then
			error(module_name .. ".format: '" .. name .. "' is a " .. type(value) .. ", not a string")
		end
		return value or error(module_name .. ".format: '" .. name .. "' not found in table")
	end))
end

--[==[Unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, this function takes exactly two parameters: a format string and a table. Every occurrence of {{code|lua|{param_name}}} in the format string is replaced with the table's entry for {{code|lua|param_name}}. Literal opening and closing braces are written as <code>{\op}</code> and <code>{\cl}</code>, respectively. A table key that begins with a backslash is referred to by doubling that initial backslash.
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
	-- Adapter handed to format_fun: resolves a placeholder name by indexing the table.
	local function lookup(key)
		return tbl[key]
	end
	return export.format_fun(str, lookup)
end

--[==[Splits a string into an array with one UTF-8 character per entry. '''Warning''': to keep speed and memory use low, no validation of the byte sequence is performed, so input that is not well-formed UTF-8 results in undefined behaviour.]==]
function export.explode_utf8(str)
	local total = #str
	local chars, count = {}, 0
	local pos = 1
	
	while pos <= total do
		-- The first byte of each character determines how many bytes the character occupies.
		local lead = string.byte(str, pos)
		local width
		if lead < 0xC0 then
			width = 1
		elseif lead < 0xE0 then
			width = 2
		elseif lead < 0xF0 then
			width = 3
		else
			width = 4
		end
		count = count + 1
		chars[count] = string.sub(str, pos, pos + width - 1)
		pos = pos + width
	end
	
	return chars
end

-- A helper function which takes a string, position and type ("byte" or "char"), and returns the equivalent position for the other type (e.g. iterate_utf8("字典", 2, "char") returns 4, because character 2 of "字典" begins with byte 4). `pos` can be positive or negative, and the function will iterate over the string forwards or backwards (respectively) until it reaches the input position. Checks byte-by-byte; skipping over trailing bytes, and then calculating the correct byte trail for any leading bytes (i.e. how many trailing bytes should follow); these trailing bytes are then checked together.
-- The optional parameters `init_from_type` and `init_to_type` can be used to start part-way through an iteration to improve performance, if multiple values need to be returned from the same string. For example, iterate_utf8("слова́рь", 11, "byte", 5, 3) will begin checking at byte 5/the start of character 3. Note: The function won't check if these values match each other (as the only way to do this would be to run the iteration from the beginning), so mismatched values will return incorrect results.
-- Throws an error if a byte sequence that is not valid UTF-8 is encountered before the position is reached.
local function iterate_utf8(text, pos, from_type, init_from_type, init_to_type)
	-- Position 0 is always valid and never changes.
	if pos == 0 then
		return pos
	end
	
	local to_type
	if from_type == "char" then
		to_type = "byte"
	else
		to_type = "char"
	end
	
	-- Positive positions iterate forwards; negative positions iterate backwards.
	local iterate_val
	if pos > 0 then
		iterate_val = 1
	else
		iterate_val = -1
	end
	
	-- Adjust init_from_type and init_to_type to the iteration before, so that matches for the position given by them will work.
	local trail, cp, min, b = 0
	local c, leading_byte = {}
	c[from_type] = init_from_type and init_from_type ~= 0 and init_from_type - iterate_val or 0
	c[to_type] = init_to_type and init_to_type ~= 0 and init_to_type - iterate_val or 0
	
	while true do
		-- Index of the byte being examined: counted from the start when going forwards, and from the end when going backwards.
		if pos > 0 then
			b = text:byte(c.byte + 1)
		else
			b = text:byte(text:len() + c.byte)
		end
		-- Position byte doesn't exist, so iterate the return value and return it.
		if not b then
			return c[to_type] + iterate_val
		elseif b < 0x80 then
			-- 1-byte codepoint, 00-7F.
			trail = 0
			cp = b
			min = 0
			leading_byte = true
		elseif b < 0xc0 then
			-- A trailing byte.
			leading_byte = false
		elseif b < 0xc2 then
			-- An overlong encoding for a 1-byte codepoint.
			error("String " .. text .. " is not UTF-8.")
		elseif b < 0xe0 then
			-- 2-byte codepoint, C2-DF.
			trail = 1
			cp = b - 0xc0
			min = 0x80
			leading_byte = true
		elseif b < 0xf0 then
			-- 3-byte codepoint, E0-EF.
			trail = 2
			cp = b - 0xe0
			min = 0x800
			leading_byte = true
		elseif b < 0xf4 then
			-- 4-byte codepoint, F0-F3.
			trail = 3
			cp = b - 0xf0
			min = 0x10000
			leading_byte = true
		elseif b == 0xf4 then
			-- 4-byte codepoint, F4.
			-- Make sure it doesn't decode to over U+10FFFF. The byte to inspect is the one straight after the leading byte, and its index depends on the direction of iteration (mirroring how `b` was fetched above). If that byte is missing, the trailing-byte check below rejects the string.
			local second
			if pos > 0 then
				second = text:byte(c.byte + 2)
			else
				second = text:byte(text:len() + c.byte + 1)
			end
			if second and second > 0x8f then
				error("String " .. text .. " is not UTF-8.")
			end
			trail = 3
			cp = 4
			min = 0x100000
			leading_byte = true
		else
			-- Codepoint over U+10FFFF, or invalid byte.
			error("String " .. text .. " is not UTF-8.")
		end
		
		-- Check subsequent bytes for multibyte codepoints.
		if leading_byte then
			local from, to
			if pos > 0 then
				from, to = c.byte + 2, c.byte + 1 + trail
			else
				from, to = text:len() + c.byte + 1, text:len() + c.byte + trail
			end
			for trailing_byte = from, to do
				b = text:byte(trailing_byte)
				if not b or b < 0x80 or b > 0xbf then
					error("String " .. text .. " is not UTF-8.")
				end
				cp = cp * 0x40 + b - 0x80
			end
			local next_byte = text:byte(to + 1)
			if next_byte and next_byte >= 0x80 and next_byte <= 0xbf then
				-- Too many trailing bytes.
				error("String " .. text .. " is not UTF-8.")
			elseif cp < min then
				-- Overlong encoding.
				error("String " .. text .. " is not UTF-8.")
			end
		end
		
		-- Every byte advances the byte counter, but only leading bytes advance the character counter.
		c.byte = c.byte + iterate_val
		if leading_byte then
			c.char = c.char + iterate_val
		end
		
		if c[from_type] == pos then
			return c[to_type]
		end
	end
end

--[==[Returns the byte position at which the character at character position `pos` of `text` begins. Negative positions count from the end of the string.]==]
function export.charsToBytes(text, pos)
	local byte_pos = iterate_utf8(text, pos, "char")
	return byte_pos
end

--[==[Returns the character position that corresponds to byte position `pos` of `text`. Throws an error if the byte at `pos` is a UTF-8 trailing byte (0x80-0xBF), since such a byte does not begin a character.]==]
function export.bytesToChars(text, pos)
	local b = text:byte(pos)
	local is_trailing = b and b >= 0x80 and b <= 0xbf
	if is_trailing then
		error("Byte " .. pos .. " is not a leading byte.")
	end
	return iterate_utf8(text, pos, "byte")
end

-- A helper function which iterates through a pattern, and returns two values: a potentially modified version of the pattern, and a boolean indicating whether the returned pattern is simple (i.e. whether it can be used with the stock string library); if not, then the pattern is complex (i.e. it must be used with the ustring library, which is much more resource-intensive).
-- `text` is the string the pattern will be applied to; it is only inspected to see whether it contains any multibyte characters. `plain` has the same meaning as in string.find.
local function patternSimplifier(text, pattern, plain)
	pattern = tostring(pattern)
	-- If `plain` is set, then the pattern is treated as literal (so is always simple). Only used by find.
	if plain then
		return pattern, true
	end
	-- Fast path: if none of the features below are present, the full scan further down would return the pattern unchanged and flagged as simple, so return straight away.
	-- Each test is deliberately at least as broad as the corresponding check in the scan, so that the fast path can never disagree with it. Note that string.match has no "plain" mode, so literal substrings are tested with string.find.
	local _, open_parens = pattern:gsub("%(", "")
	if not (
		-- A multibyte char which might be inside a charset.
		(pattern:find("[", 1, true) and pattern:find("[\128-\255]")) or
		-- A multibyte char followed by a quantifier.
		pattern:match("[\128-\255][%*%+%?%-]") or
		-- A character class or "%b".
		pattern:match("%%[abcdlpsuwxACDLPSUWXZ]") or
		-- A negative charset.
		pattern:find("[^", 1, true) or
		-- "." which is not followed by "*", "+" or "-".
		pattern:match("%.[^%*%+%-]") or
		pattern:match("%.$") or
		pattern:match("%%b.?[\128-\255]") or
		-- A position capture.
		pattern:find("()", 1, true) or
		-- Potentially more capture groups than the string library allows.
		open_parens > 32
	) then
		return pattern, true
	end
	-- Otherwise, the pattern could go either way.
	-- Build up the new pattern in a table, then concatenate at the end. we do it this way, as occasionally entries get modified along the way.
	local new_pattern = {}
	local len, pos, b = pattern:len(), 0
	local char, next_char
	
	-- `escape` and `balanced` are counters, which ensure the effects of % or %b (respectively) are distributed over the following bytes.
	-- `set` is a boolean that states whether the current byte is in a charset.
	-- `capture` keeps track of how many layers of capture groups the position is in, while `captures` keeps a tally of how many groups have been detected (due to the string library limit of 32).
	local escape, set, balanced, capture, captures = 0, false, 0, 0, 0
	
	while pos < len do
		pos = pos + 1
		b = pattern:byte(pos)
		if escape > 0 then escape = escape - 1 end
		if balanced > 0 then balanced = balanced - 1 end
		char = next_char or pattern:sub(pos, pos)
		next_char = pattern:sub(pos + 1, pos + 1)
		if escape == 0 then
			if char == "%" then
				-- Apply % escape.
				if next_char == "." or next_char == "%" or next_char == "[" or next_char == "]" then
					escape = 2
					if balanced > 0 then balanced = balanced + 1 end
				-- These charsets make the pattern complex.
				elseif next_char:match("[acdlpsuwxACDLPSUWXZ]") then
					return pattern, false
				-- This is "%b".
				elseif next_char == "b" then
					balanced = 4
				end
			-- Enter or leave a charset.
			elseif char == "[" then
				set = true
			elseif char == "]" then
				set = false
			elseif char == "(" then
				capture = capture + 1
			elseif char == ")" then
				if capture > 0 and set == false and balanced == 0 then
					captures = captures + 1
					capture = capture - 1
				end
			end
		end
		
		-- Multibyte char.
		if b > 0x7f then
			-- If followed by "*", "+" or "-", then 2-byte chars can be converted into charsets. However, this is not possible with 3 or 4-byte chars, as the charset would be too permissive, because if the trailing bytes were in a different order then this could be a different valid character.
			if next_char == "*" or next_char == "+" or next_char == "-" then
				-- The previous byte does not exist if the pattern begins with a stray high byte; treat that as complex instead of comparing nil with a number.
				local prev_b = pattern:byte(pos - 1)
				if prev_b and prev_b > 0xc1 and prev_b < 0xe0 then
					new_pattern[#new_pattern] = "[" .. new_pattern[#new_pattern]
					table.insert(new_pattern, char .. "]")
				else
					return pattern, false
				end
			-- If followed by "?", add "?" after each byte.
			elseif next_char == "?" then
				table.insert(new_pattern, char .. "?")
				local check_pos, check_b, i = pos, pattern:byte(pos), #new_pattern
				-- Walk back over the trailing bytes of the char until its leading byte has also been given a "?".
				while check_b and check_b < 0xc0 do
					check_pos = check_pos - 1
					check_b = pattern:byte(check_pos)
					i = i - 1
					new_pattern[i] = new_pattern[i] .. "?"
				end
				pos = pos + 1
				next_char = pattern:sub(pos + 1, pos + 1)
			-- If in a charset or used in "%b", then the pattern is complex.
			elseif set or balanced > 0 then
				return pattern, false
			else
				table.insert(new_pattern, char)
			end
		elseif char == "." then
			-- "*", "+", "-" are always okay after ".", as they don't care how many bytes a char has.
			if set or next_char == "*" or next_char == "+" or next_char == "-" or escape > 0 then
				table.insert(new_pattern, char)
			-- If followed by "?", make sure "?" is after the leading byte of the UTF-8 char pattern, then skip forward one.
			elseif next_char == "?" then
				table.insert(new_pattern, "[%z\1-\127\194-\244]?[\128-\191]*")
				pos = pos + 1
				next_char = pattern:sub(pos + 1, pos + 1)
			-- If used with "%b", pattern is complex.
			elseif balanced > 0 then
				return pattern, false
			-- Otherwise, add the UTF-8 char pattern.
			else
				table.insert(new_pattern, "[%z\1-\127\194-\244][\128-\191]*")
			end
		-- Negative charsets are always complex, unless the text has no UTF-8 chars.
		elseif char == "[" and next_char == "^" and escape == 0 and text:match("[\128-\255]") then
			return pattern, false
		-- "()" matches the position unless escaped or used with "%b", so always necessitates ustring (as we need it to match the char position, not the byte one).
		elseif char == "(" and next_char == ")" and balanced == 0 and escape == 0 and text:match("[\128-\255]") then
			return pattern, false
		else
			table.insert(new_pattern, char)
		end
	end
	if captures > 32 then
		return pattern, false
	else
		pattern = table.concat(new_pattern)
		return pattern, true
	end
end

--[==[Returns the number of characters in `text`, giving the same result as mw.ustring.len while relying only on string.len and a byte-level scan. `text` is converted with tostring first.]==]
function export.len(text)
	local str = tostring(text)
	local byte_count = str:len()
	-- Only strings containing bytes above 0x7F can have fewer characters than bytes.
	if str:match("[\128-\255]") then
		return iterate_utf8(str, byte_count, "byte")
	end
	return byte_count
end

--[==[Returns the substring of `text` between character positions `i_char` and `j_char`, giving the same result as mw.ustring.sub while relying only on string.sub and a byte-level scan. `text` is converted with tostring first.]==]
function export.sub(text, i_char, j_char)
	text = tostring(text)
	-- Without any bytes above 0x7F, character positions and byte positions coincide.
	if not text:match("[\128-\255]") then
		return text:sub(i_char, j_char)
	end
	
	-- Converts a character position into a byte position, optionally resuming from a known character/byte pair.
	local function byte_of(char_pos, init_char, init_byte)
		return iterate_utf8(text, char_pos, "char", init_char, init_byte)
	end
	
	local i_byte, j_byte
	if not j_char then
		i_byte = byte_of(i_char)
	elseif i_char > 0 and j_char > 0 then
		if j_char < i_char then return "" end
		i_byte = byte_of(i_char)
		-- The end byte is the one just before the start of the following character; resume the scan from i.
		j_byte = byte_of(j_char + 1, i_char, i_byte) - 1
	elseif i_char < 0 and j_char < 0 then
		if j_char < i_char then return "" end
		-- Scanning backwards, so j is reached first and the scan for i resumes from it.
		j_byte = byte_of(j_char + 1) - 1
		i_byte = byte_of(i_char, j_char, j_byte)
	else
		i_byte = byte_of(i_char)
		-- For some reason, mw.ustring.sub with i=0, j=0 returns the same result as for i=1, j=1, while string.sub always returns "". However, mw.ustring.sub does return "" with i=1, j=0. As such, j_char is adjusted to 1 if i_char is either 0, or negative with a magnitude greater than the length of the string.
		if j_char == 0 and (i_byte == 0 or -i_byte > text:len()) then
			j_char = 1
		end
		j_byte = byte_of(j_char + 1) - 1
	end
	return text:sub(i_byte, j_byte)
end

--[==[Returns `text` (converted with tostring) in lowercase. string.lower is used when the string contains no bytes above 0x7F; otherwise mw.ustring.lower is used.]==]
function export.lower(text)
	local str = tostring(text)
	if str:match("[\128-\255]") then
		return mw.ustring.lower(str)
	end
	return str:lower()
end

--[==[Returns `text` (converted with tostring) in uppercase. string.upper is used when the string contains no bytes above 0x7F; otherwise mw.ustring.upper is used.]==]
function export.upper(text)
	local str = tostring(text)
	if str:match("[\128-\255]") then
		return mw.ustring.upper(str)
	end
	return str:upper()
end

--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find. `init_char` and the returned start and end positions are character positions, as with mw.ustring.find; any captures are returned after them.]==]
function export.find(text, pattern, init_char, plain)
	text = tostring(text)
	local simple
	pattern, simple = patternSimplifier(text, pattern, plain)
	if not simple then
		return mw.ustring.find(text, pattern, init_char, plain)
	elseif not text:match("[\128-\255]") then
		-- No multibyte characters, so byte positions and character positions coincide.
		return text:find(pattern, init_char, plain)
	end
	
	-- The pattern is simple but multibyte characters are present, so init_char needs to be converted into bytes for string.find to work properly, and the return values need to be converted back into chars.
	local init_byte = init_char and iterate_utf8(text, init_char, "char")
	
	-- Converts the positions returned by string.find into character positions. The captures are passed through as varargs, so that none of them are lost however many there are.
	local function to_char_positions(byte1, byte2, ...)
		-- If string.find returned nil, then return nil.
		if not (byte1 and byte2) then
			return nil
		end
		
		-- Get first return value. If we have a positive init_char, we can save resources by resuming at that point.
		local char1
		if (not init_char) or init_char > 0 then
			char1 = iterate_utf8(text, byte1, "byte", init_byte, init_char)
		else
			char1 = iterate_utf8(text, byte1, "byte")
		end
		
		local char2
		if byte2 < byte1 then
			-- Empty match: string.find reports an end position one before the start. Resuming the iteration from byte1 would begin beyond byte2 and run off the end of the string, so derive the value directly.
			char2 = char1 - 1
		elseif byte1 == byte2 then
			-- If byte1 and byte2 are the same, don't bother running iterate_utf8 twice.
			char2 = char1
		else
			-- Otherwise, resume iterate_utf8 from byte1 to find char2.
			char2 = iterate_utf8(text, byte2, "byte", byte1, char1)
		end
		
		return char1, char2, ...
	end
	
	return to_char_positions(text:find(pattern, init_byte, plain))
end

--[==[Returns the captures (or the whole match) of `pattern` in `text`, starting at character position `init`. string.match is used when the pattern can be expressed for the stock string library; otherwise mw.ustring.match is used.]==]
function export.match(text, pattern, init)
	local str = tostring(text)
	local simplified, is_simple = patternSimplifier(str, pattern)
	if not is_simple then
		return mw.ustring.match(str, simplified, init)
	end
	-- string.match expects a byte position, which only differs from the character position when multibyte characters are present.
	if init and str:find("[\128-\255]") then
		init = iterate_utf8(str, init, "char")
	end
	return str:match(simplified, init)
end

--[==[Returns an iterator over the matches of `pattern` in `text`. string.gmatch is used when the pattern can be expressed for the stock string library; otherwise mw.ustring.gmatch is used.]==]
function export.gmatch(text, pattern)
	local str = tostring(text)
	local simplified, is_simple = patternSimplifier(str, pattern)
	if not is_simple then
		return mw.ustring.gmatch(str, simplified)
	end
	return str:gmatch(simplified)
end

--[==[Replaces matches of `pattern` in `text` with `repl`, at most `n` times if `n` is given. string.gsub is used when the pattern can be expressed for the stock string library; otherwise mw.ustring.gsub is used.]==]
function export.gsub(text, pattern, repl, n)
	local str = tostring(text)
	local simplified, is_simple = patternSimplifier(str, pattern)
	if not is_simple then
		return mw.ustring.gsub(str, simplified, repl, n)
	end
	return str:gsub(simplified, repl, n)
end

--[==[
-- Reimplementation of mw.ustring.split() that includes any capturing
-- groups in the splitting pattern. This works like Python's re.split()
-- function, except that it has Lua's behavior when the split pattern
-- is empty (i.e. advancing by one character at a time; Python returns the
-- whole remainder of the string).
-- Returns an array in which the pieces of `str` alternate with the captures
-- of each separator match.
]==]
function export.capturing_split(str, pattern)
	local ret = {}
	-- (.-) corresponds to (.*?) in Python or Perl; () captures the
	-- current position after matching.
	pattern = "(.-)" .. pattern .. "()"
	-- `start` is a character position (export.match and export.sub both work
	-- in characters), so it has to be compared against the length in
	-- characters rather than the length in bytes; otherwise splitting
	-- multibyte text with an empty-matching pattern appends spurious empty
	-- strings at the end.
	local str_len = export.len(str)
	local start = 1
	while true do
		-- Did we reach the end of the string?
		if start > str_len then
			table.insert(ret, "")
			return ret
		end
		-- match() returns all captures as multiple return values;
		-- we need to insert into a table to get them all.
		local captures = {export.match(str, pattern, start)}
		-- If no match, add the remainder of the string.
		if #captures == 0 then
			table.insert(ret, export.sub(str, start))
			return ret
		end
		local newstart = table.remove(captures)
		-- Special case: If we don't advance by any characters, then advance
		-- by one character; this avoids an infinite loop, and makes splitting
		-- by an empty string work the way mw.ustring.split() does. If we
		-- reach the end of the string this way, return immediately, so we
		-- don't get a final empty string.
		if newstart == start then
			table.insert(ret, export.sub(str, start, start))
			table.remove(captures, 1)
			start = start + 1
			if start > str_len then
				return ret
			end
		else
			table.insert(ret, table.remove(captures, 1))
			start = newstart
		end
		-- Insert any captures from the splitting pattern.
		for _, x in ipairs(captures) do
			table.insert(ret, x)
		end
	end
end

-- Changes the case of the first character of `text`: to lowercase if `dolower` is set, otherwise to uppercase.
-- If `text` begins with a wikilink, the first character of the link's display text is changed instead, and the link is returned in piped form.
local function uclcfirst(text, dolower)
	local recase = dolower and export.lower or export.upper
	
	-- Re-cases the first character of `str` and leaves the rest untouched.
	local function recase_first(str)
		return recase(export.sub(str, 1, 1)) .. export.sub(str, 2)
	end
	
	-- This pattern matches both piped and unpiped links at the beginning of the text.
	-- If the link is not piped, the second capture (linktext) will be empty.
	local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
	if not link then
		return recase_first(text)
	end
	
	-- An unpiped link displays its target, so that is what gets re-cased.
	local display = linktext
	if display == "" then
		display = link
	end
	return "[[" .. link .. "|" .. recase_first(display) .. "]]" .. remainder
end

--[==[Returns `text` with its first character converted to uppercase. If `text` begins with a wikilink, the first character of the link's display text is converted instead, and the link is returned in piped form.]==]
function export.ucfirst(text)
	local dolower = false
	return uclcfirst(text, dolower)
end

--[==[Returns `text` with its first character converted to lowercase. If `text` begins with a wikilink, the first character of the link's display text is converted instead, and the link is returned in piped form.]==]
function export.lcfirst(text)
	local dolower = true
	return uclcfirst(text, dolower)
end

-- Faster version of mw.text.nowiki, with minor changes to match the PHP equivalent: ";" always escapes, and colons in unslashed protocols only escape after regex \b.
do
	-- Returns the decimal HTML entity for the first byte of `ch`.
	local function entity(ch)
		return "&#" .. ch:byte() .. ";"
	end
	
	-- gsub callback. With two captures, the first is kept as-is and the second is escaped; with a single argument, that argument is escaped.
	local function escape_char(prefix, target)
		if target then
			return prefix .. entity(target)
		end
		return entity(prefix)
	end
	
	-- gsub callback for a word followed by a colon: the colon is only escaped if the word is listed in the data module's table of URI schemes.
	local function escape_uri(scheme)
		local known_schemes = mw.loadData("Module:string utilities/data").uri_schemes
		if known_schemes[scheme:lower()] then
			return scheme .. "&#58;"
		end
		return scheme .. ":"
	end
	
	--[==[Escapes the characters and sequences in `text` which could be interpreted as wikitext, returning the escaped string.]==]
	function export.nowiki(text)
		-- Each assignment to a single variable discards gsub's substitution count.
		-- Characters which are always escaped.
		local out = text:gsub("[\"&';<=>%[%]{|}]", escape_char)
		-- Characters which are only significant at the start of the text or of a line.
		out = out:gsub("^[\t\n\r #%*:]", escape_char)
		out = out:gsub("([\n\r])([\t\n\r #%*:])", escape_char)
		-- Four hyphens at the start of the text or of a line.
		out = out:gsub("%f[^%z\r\n]%-(%-%-%-)", "&#45;%1")
		out = out:gsub("__", "_&#95;")
		out = out:gsub("://", "&#58;//")
		-- Whitespace following the magic link keywords.
		out = out:gsub("(ISBN)(%s)", escape_char)
		out = out:gsub("(PMID)(%s)", escape_char)
		out = out:gsub("(RFC)(%s)", escape_char)
		out = out:gsub("([%w_]+):", escape_uri)
		return out
	end
end

--[==[Capitalizes a multi-word string by uppercasing the first letter of each whitespace-separated part; the parts are rejoined with single spaces. The remaining letters of each part are left as they are. A frame-like table may be passed instead of a string, in which case its first argument is used.]==]
function export.capitalize(text)
	if type(text) == "table" then
		-- allow calling from a template
		text = text.args[1]
	end
	-- I assume nobody will input all CAP text.
	-- Declared local so that the accumulator does not leak into the global environment.
	local words = {}
	for word in export.gmatch(text, "%S+") do
		words[#words + 1] = uclcfirst(word, false)
	end
	return table.concat(words, " ")
end

--[==[Pluralizes a word in a smart fashion, according to normal English rules:
# If the word ends in consonant + -y, replace the -y with -ies.
# If the word ends in -s, -x, -z, -sh, -ch, add -es.
# Otherwise, add -s.
Links at the end of the text are handled as follows:
# If a piped link, change the second part appropriately.
# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
# If a non-piped link and rules #2 or #3 above apply, add the plural ending outside the link.
A frame-like table may be passed instead of a string, in which case its first argument is used.]==]
function export.pluralize(text)
	if type(text) == "table" then
		-- allow calling from a template
		text = text.args[1]
	end
	
	-- Rule #1 test: the word ends in -y preceded by something other than a vowel or a space.
	-- FIXME, a subrule of rule #1 says the -ies ending doesn't apply to
	-- proper nouns, hence "the Gettys", "the public Ivys". We should maybe
	-- consider applying this rule here; but it may not be important as this
	-- function is almost always called on common nouns (e.g. parts of
	-- speech, place types).
	local function ends_in_consonant_y(word)
		return word:find("[^aeiouAEIOU ]y$")
	end
	
	-- Rule #2 test: the word ends in -s, -x, -z, -sh or -ch.
	local function takes_es(word)
		return word:find("[sxz]$") or word:find("[cs]h$")
	end
	
	-- Replaces a final -y with -ies; the local assignment discards gsub's substitution count.
	local function y_to_ies(word)
		local result = word:gsub("y$", "ies")
		return result
	end
	
	-- Applies rules #1 to #3 to plain text.
	local function plural_of(word)
		if ends_in_consonant_y(word) then
			return y_to_ies(word)
		end
		if takes_es(word) then
			return word .. "es"
		end
		return word .. "s"
	end
	
	-- Check for a link. This pattern matches both piped and unpiped links.
	-- If the link is not piped, the second capture (linktext) will be empty.
	local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
	if not link then
		return plural_of(text)
	end
	if linktext ~= "" then
		return beginning .. "[[" .. link .. "|" .. plural_of(linktext) .. "]]"
	end
	if ends_in_consonant_y(link) then
		return beginning .. "[[" .. link .. "|" .. y_to_ies(link) .. "]]"
	end
	local suffix = takes_es(link) and "es" or "s"
	return beginning .. "[[" .. link .. "]]" .. suffix
end

--[==[Singularizes a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.
'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
# If the word ends in -ies, replace -ies with -y.
# If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
# Otherwise, remove -s.
Links at the end of the text are handled as follows:
# If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same.
# If a non-piped link, singularize the link.
# A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the 'sh' etc. and final -es.
A frame-like table may be passed instead of a string, in which case its first argument is used.]==]
function export.singularize(text)
	if type(text) == "table" then
		-- allow calling from a template
		text = text.args[1]
	end
	
	-- Applies rules #1 to #3 to plain text; text matching none of them is returned unchanged.
	local function singular_of(word)
		local stem = word:match("^(.-)ies$")
		if stem then
			return stem .. "y"
		end
		-- The rules are tried in order: "[[parish]]es"-style endings, then
		-- "[[box]]es"-style endings, then a plain final -s.
		stem = word:match("^(.-[sc]h%]*)es$")
			or word:match("^(.-x%]*)es$")
			or word:match("^(.-)s$")
		return stem or word
	end
	
	-- Check for a link. This pattern matches both piped and unpiped links.
	-- If the link is not piped, the second capture (linktext) will be empty.
	local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
	if not link then
		return singular_of(text)
	end
	if linktext == "" then
		return beginning .. "[[" .. singular_of(link) .. "]]"
	end
	
	-- Piped link: singularize the display text, and drop the pipe if the result is identical to the target.
	local singular_text = singular_of(linktext)
	if link == singular_text then
		return beginning .. "[[" .. link .. "]]"
	end
	return beginning .. "[[" .. link .. "|" .. singular_text .. "]]"
end


--[==[Returns `text` prefixed with "an " if it begins with one of the letters A, E, I, O or U (in either case), or with "a " otherwise. The article is capitalized if `uppercase` is set. If `text` begins with a wikilink, the first letter of the link's display text is examined.]==]
function export.add_indefinite_article(text, uppercase)
	-- This pattern matches both piped and unpiped links at the beginning of the text.
	-- If the link is not piped, the second capture (linktext) will be empty.
	local link, linktext = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
	
	-- Work out which string's first letter decides the article.
	local probe = text
	if link then
		probe = linktext ~= "" and linktext or link
	end
	
	local article
	if export.find(probe, "^[AEIOUaeiou]") then
		article = uppercase and "An " or "an "
	else
		article = uppercase and "A " or "a "
	end
	return article .. text
end

--[==[Converts risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.). If `text` contains no non-whitespace characters, its whitespace characters are encoded instead.]==]
function export.escape_risky_characters(text)
	local charset
	if mw.ustring.match(text, "%S") then
		charset = "!#%%&*+/:;<=>?@[\\%]_{|}"
	else
		-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software.
		charset = "%s"
	end
	return mw.text.encode(text, charset)
end

return export