မေႃႇၵျူး:string utilities: လွင်ႈပႅၵ်ႇပိူင်ႈ ၼႂ်းၵႄႈ လွင်ႈၶူၼ်ႉၶႆႈ
No edit summary |
No edit summary Tag: Reverted |
||
ထႅဝ် 1: | ထႅဝ် 1: | ||
local mw = mw |
|||
local string = string |
|||
local table = table |
|||
local ustring = mw.ustring |
|||
local byte = string.byte |
|||
local char = string.char |
|||
local concat = table.concat |
|||
local find = string.find |
|||
local format = string.format |
|||
local gmatch = string.gmatch |
|||
local gsub = string.gsub |
|||
local len = string.len |
|||
local load_data = mw.loadData |
|||
local lower = string.lower |
|||
local match = string.match |
|||
local next = next |
|||
local reverse = string.reverse |
|||
local select = select |
|||
local sort = table.sort |
|||
local sub = string.sub |
|||
local tonumber = tonumber |
|||
local tostring = tostring |
|||
local type = type |
|||
local ucodepoint = ustring.codepoint |
|||
local ufind = ustring.find |
|||
local ugcodepoint = ustring.gcodepoint |
|||
local ugmatch = ustring.gmatch |
|||
local ugsub = ustring.gsub |
|||
local ulower = ustring.lower |
|||
local umatch = ustring.match |
|||
local unpack = unpack |
|||
local upper = string.upper |
|||
local usub = ustring.sub |
|||
local uupper = ustring.upper |
|||
-- Defined below. |
|||
local charset_escape |
|||
local codepoint |
|||
local explode_utf8 |
|||
local format_fun |
|||
local get_indefinite_article |
|||
local pattern_escape |
|||
local pattern_simplifier |
|||
local php_trim |
|||
local replacement_escape |
|||
local u |
|||
local ulen |
|||
local module_name = "string_utilities" |
local module_name = "string_utilities" |
||
local export = {} |
local export = {} |
||
--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==] |
|||
local format_escapes = { |
|||
function export.explode_utf8(str) |
|||
["op"] = "{", |
|||
local text, i = {}, 0 |
|||
for ch in gmatch(str, ".[\128-\191]*") do |
|||
} |
|||
i = i + 1 |
|||
text[i] = ch |
|||
end |
|||
return text |
|||
end |
|||
explode_utf8 = export.explode_utf8 |
|||
--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==] |
|||
function export.format_fun(str, fun) |
|||
function export.pattern_escape(str) |
|||
return (str:gsub("{(\\?)((\\?)[^{}]*)}", function (p1, name, p2) |
|||
return (gsub(str, "[$%%()*+%-.?[%]^]", "%%%0")) |
|||
if #p1 + #p2 == 1 then |
|||
return format_escapes[name] or error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'") |
|||
else |
|||
if fun(name) and type(fun(name)) ~= "string" then |
|||
error(module_name .. ".format: '" .. name .. "' is a " .. type(fun(name)) .. ", not a string") |
|||
end |
|||
return fun(name) or error(module_name .. ".format: '" .. name .. "' not found in table") |
|||
end |
|||
end)) |
|||
end |
end |
||
pattern_escape = export.pattern_escape |
|||
--[==[Escapes only the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>.]==] |
|||
--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash. |
|||
function export.charset_escape(str) |
|||
====Examples==== |
|||
return (gsub(str, "[%%%-%]^]", "%%%0")) |
|||
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}} |
|||
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}} |
|||
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}} |
|||
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}} |
|||
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==] |
|||
function export.format(str, tbl) |
|||
return export.format_fun(str, function (key) return tbl[key] end) |
|||
end |
end |
||
charset_escape = export.charset_escape |
|||
--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==] |
|||
--[==[Explodes a string into an array of UTF8 characters. '''Warning''': this function has no safety checks for non-UTF8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==] |
|||
function export. |
function export.replacement_escape(str) |
||
return (gsub(str, "%%", "%%%%")) |
|||
local byte = string.byte |
|||
end |
|||
local sub = string.sub |
|||
replacement_escape = export.replacement_escape |
|||
local str_len = #str |
|||
do |
|||
local text = {} |
|||
local |
local function check_sets_equal(set1, set2) |
||
local k2 |
|||
for k1, v1 in next, set1 do |
|||
while n <= str_len do |
|||
local v2 = set2[k1] |
|||
b = byte(str, n) |
|||
if v1 ~= v2 and (v2 == nil or not check_sets_equal(v1, v2)) then |
|||
i = i + 1 |
|||
return false |
|||
if b < 0xC0 then |
|||
end |
|||
text[i] = sub(str, n, n) |
|||
k2 = next(set2, k2) |
|||
elseif b < 0xE0 then |
|||
text[i] = sub(str, n, n + 1) |
|||
n = n + 2 |
|||
elseif b < 0xF0 then |
|||
text[i] = sub(str, n, n + 2) |
|||
n = n + 3 |
|||
else |
|||
text[i] = sub(str, n, n + 3) |
|||
n = n + 4 |
|||
end |
end |
||
return next(set2, k2) == nil |
|||
end |
end |
||
local function check_sets(bytes) |
|||
return text |
|||
local key, set1, set = next(bytes) |
|||
end |
|||
if set1 == true then |
|||
return true |
|||
-- A helper function which takes a string, position and type ("byte" or "char"), and returns the equivalent position for the other type (e.g. iterate_utf8("字典", 2, "char") returns 4, because character 2 of "字典" begins with byte 4). `pos` can be positive or negative, and the function will iterate over the string forwards or backwards (respectively) until it reaches the input position. Checks byte-by-byte; skipping over trailing bytes, and then calculating the correct byte trail for any leading bytes (i.e. how many trailing bytes should follow); these trailing bytes are then checked together. |
|||
elseif not check_sets(set1) then |
|||
-- The optional parameters `init_from_type` and `init_to_type` can be used to start part-way through an iteration to improve performance, if multiple values need to be returned from the same string. For example, iterate_utf8("слова́рь", 11, "byte", 5, 3) will begin checking at byte 5/the start of character 3. Note: The function won't check if these values match each other (as the only way to do this would be to run the iteration from the beginning), so mismatched values will return incorrect results. |
|||
return false |
|||
local function iterate_utf8(text, pos, from_type, init_from_type, init_to_type) |
|||
end |
|||
-- Position 0 is always valid and never changes. |
|||
while true do |
|||
if pos == 0 then |
|||
key, set = next(bytes, key) |
|||
return pos |
|||
if not key then |
|||
return true |
|||
elseif not check_sets_equal(set, set1) then |
|||
return false |
|||
end |
|||
end |
|||
end |
end |
||
local function make_charset(range) |
|||
local to_type |
|||
if #range == 1 then |
|||
return char(range[1]) |
|||
to_type = "byte" |
|||
end |
|||
else |
|||
sort(range) |
|||
to_type = "char" |
|||
local compressed, n, start = {}, 0, range[1] |
|||
for i = 1, #range do |
|||
local this, nxt = range[i], range[i + 1] |
|||
if nxt ~= this + 1 then |
|||
n = n + 1 |
|||
compressed[n] = this == start and char(this) or |
|||
char(start) .. "-" .. char(this) |
|||
start = nxt |
|||
end |
|||
end |
|||
return "[" .. concat(compressed) .. "]" |
|||
end |
end |
||
local function parse_1_byte_charset(pattern, pos) |
|||
-- Positive positions iterate forwards; negative positions iterate backwards. |
|||
while true do |
|||
local iterate_val |
|||
local ch, nxt_pos |
|||
if pos > 0 then |
|||
pos, ch, nxt_pos = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos) |
|||
iterate_val = 1 |
|||
if not ch then |
|||
else |
|||
return false |
|||
iterate_val = -1 |
|||
elseif ch == "%" then |
|||
if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", nxt_pos) then |
|||
return false |
|||
end |
|||
pos = pos + 2 |
|||
elseif ch == "]" then |
|||
pos = nxt_pos |
|||
return pos |
|||
else |
|||
return false |
|||
end |
|||
end |
|||
end |
end |
||
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==] |
|||
-- Adjust init_from_type and init_to_type to the iteration before, so that matches for the position given by them will work. |
|||
pattern_simplifier = require("Module:fun").memoize(function(pattern) |
|||
local trail, cp, min, b = 0 |
|||
if type(pattern) == "number" then |
|||
local c, leading_byte = {} |
|||
return tostring(pattern) |
|||
c[from_type] = init_from_type and init_from_type ~= 0 and init_from_type - iterate_val or 0 |
|||
c[to_type] = init_to_type and init_to_type ~= 0 and init_to_type - iterate_val or 0 |
|||
while true do |
|||
if pos > 0 then |
|||
b = text:byte(c.byte + 1) |
|||
else |
|||
b = text:byte(text:len() + c.byte) |
|||
end |
end |
||
local pos, captures, start, n, output = 1, 0, 1, 0 |
|||
-- Position byte doesn't exist, so iterate the return value and return it. |
|||
while true do |
|||
local ch, nxt_pos |
|||
return c[to_type] + iterate_val |
|||
pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos) |
|||
elseif b < 0x80 then |
|||
if not ch then |
|||
-- 1-byte codepoint, 00-7F. |
|||
break |
|||
cp = b |
|||
min = 0 |
|||
leading_byte = true |
|||
elseif b < 0xc0 then |
|||
-- A trailing byte. |
|||
leading_byte = false |
|||
elseif b < 0xc2 then |
|||
-- An overlong encoding for a 1-byte codepoint. |
|||
error("String " .. text .. " is not UTF-8.") |
|||
elseif b < 0xe0 then |
|||
-- 2-byte codepoint, C2-DF. |
|||
trail = 1 |
|||
cp = b - 0xc0 |
|||
min = 0x80 |
|||
leading_byte = true |
|||
elseif b < 0xf0 then |
|||
-- 3-byte codepoint, E0-EF. |
|||
trail = 2 |
|||
cp = b - 0xe0 |
|||
min = 0x800 |
|||
leading_byte = true |
|||
elseif b < 0xf4 then |
|||
-- 4-byte codepoint, F0-F3. |
|||
trail = 3 |
|||
cp = b - 0xf0 |
|||
min = 0x10000 |
|||
leading_byte = true |
|||
elseif b == 0xf4 then |
|||
-- 4-byte codepoint, F4. |
|||
-- Make sure it doesn't decode to over U+10FFFF. |
|||
if text:byte(c.byte + 2) > 0x8f then |
|||
error("String " .. text .. " is not UTF-8.") |
|||
end |
end |
||
local nxt = sub(pattern, nxt_pos, nxt_pos) |
|||
trail = 3 |
|||
if ch == "%" then |
|||
if nxt == "b" then |
|||
if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then |
|||
leading_byte = true |
|||
return false |
|||
else |
|||
end |
|||
-- Codepoint over U+10FFFF, or invalid byte. |
|||
pos = pos + 4 |
|||
error("String " .. text .. " is not UTF-8.") |
|||
elseif nxt == "f" then |
|||
end |
|||
pos = pos + 2 |
|||
if not match(pattern, "^()%[[^^]", pos) then |
|||
-- Check subsequent bytes for multibyte codepoints. |
|||
return false |
|||
if leading_byte then |
|||
end |
|||
local from, to |
|||
-- Only possible to convert a %f charset which is all |
|||
if pos > 0 then |
|||
-- ASCII, so use parse_1_byte_charset. |
|||
from, to = c.byte + 2, c.byte + 1 + trail |
|||
pos = parse_1_byte_charset(pattern, pos) |
|||
if not pos then |
|||
return false |
|||
end |
|||
elseif nxt == "Z" then |
|||
pos = pos + 2 |
|||
nxt = sub(pattern, pos, pos) |
|||
if nxt == "*" or nxt == "+" or nxt == "-" then |
|||
pos = pos + 1 |
|||
else |
|||
output = output or {} |
|||
n = n + 1 |
|||
if nxt == "?" then |
|||
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*" |
|||
pos = pos + 1 |
|||
else |
|||
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*" |
|||
end |
|||
start = pos |
|||
end |
|||
elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then |
|||
return false |
|||
-- Skip the next character if it's ASCII. Otherwise, we will |
|||
-- still need to do length checks. |
|||
else |
|||
pos = pos + (byte(nxt) < 128 and 2 or 1) |
|||
end |
|||
elseif ch == "(" then |
|||
if nxt == ")" or captures == 32 then |
|||
return false |
|||
end |
|||
captures = captures + 1 |
|||
pos = pos + 1 |
|||
elseif ch == "." then |
|||
if nxt == "*" or nxt == "+" or nxt == "-" then |
|||
pos = pos + 2 |
|||
else |
|||
output = output or {} |
|||
n = n + 1 |
|||
if nxt == "?" then |
|||
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*" |
|||
pos = pos + 2 |
|||
else |
|||
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*" |
|||
pos = pos + 1 |
|||
end |
|||
start = pos |
|||
end |
|||
elseif ch == "[" then |
|||
-- Fail negative charsets. TODO: 1-byte charsets should be safe. |
|||
if nxt == "^" then |
|||
return false |
|||
-- If the first character is "%", ch_len is determined by the |
|||
-- next one instead. |
|||
elseif nxt == "%" then |
|||
nxt_pos = nxt_pos + 1 |
|||
nxt = sub(pattern, nxt_pos, nxt_pos) |
|||
end |
|||
local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos) |
|||
if ch_len == 1 then -- Single-byte charset. |
|||
pos = parse_1_byte_charset(pattern, pos + 1) |
|||
if not pos then |
|||
return false |
|||
end |
|||
else -- Multibyte charset. |
|||
local charset_pos, bytes = pos |
|||
pos = pos + 1 |
|||
while true do -- TODO: non-ASCII charset ranges. |
|||
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos) |
|||
if not ch then |
|||
return false |
|||
-- If escaped, get the next character. No need to |
|||
-- distincguish magic characters or character classes, |
|||
-- as they'll all fail for having the wrong length |
|||
-- anyway. |
|||
elseif ch == "%" then |
|||
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos) |
|||
elseif ch == "]" then |
|||
pos = nxt_pos |
|||
break |
|||
end |
|||
if ch_len ~= #ch then |
|||
return false |
|||
end |
|||
bytes = bytes or {} |
|||
local bytes = bytes |
|||
for i = 1, ch_len - 1 do |
|||
local b = byte(ch, i, i) |
|||
bytes[b] = bytes[b] or {} |
|||
bytes = bytes[b] |
|||
end |
|||
bytes[byte(ch, -1)] = true |
|||
pos = nxt_pos |
|||
end |
|||
if not pos then |
|||
return false |
|||
end |
|||
local nxt = sub(pattern, pos, pos) |
|||
if ( |
|||
(nxt == "?" or nxt == "*" or nxt == "-") or |
|||
(nxt == "+" and ch_len > 2) or |
|||
not check_sets(bytes) |
|||
) then |
|||
return false |
|||
end |
|||
local ranges, b, key, next_byte = {}, 0 |
|||
repeat |
|||
key, next_byte = next(bytes) |
|||
local range, n = {key}, 1 |
|||
-- Loop starts on the second iteration. |
|||
for key in next, bytes, key do |
|||
n = n + 1 |
|||
range[n] = key |
|||
end |
|||
b = b + 1 |
|||
ranges[b] = range |
|||
bytes = next_byte |
|||
until next_byte == true |
|||
if nxt == "+" then |
|||
local range1, range2 = ranges[1], ranges[2] |
|||
ranges[1] = make_charset(range1) |
|||
ranges[3] = make_charset(range2) |
|||
local n = #range2 |
|||
for i = 1, #range1 do |
|||
n = n + 1 |
|||
range2[n] = range1[i] |
|||
end |
|||
ranges[2] = make_charset(range2) .. "*" |
|||
pos = pos + 1 |
|||
else |
|||
for i = 1, #ranges do |
|||
ranges[i] = make_charset(ranges[i]) |
|||
end |
|||
end |
|||
output = output or {} |
|||
n = n + 1 |
|||
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) |
|||
start = pos |
|||
end |
|||
elseif nxt == "+" then |
|||
if #ch ~= 2 then |
|||
return false |
|||
end |
|||
output = output or {} |
|||
n = n + 1 |
|||
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2) |
|||
pos = nxt_pos + 1 |
|||
start = pos |
|||
elseif nxt == "?" or nxt == "*" or nxt == "-" then |
|||
return false |
|||
else |
else |
||
pos = nxt_pos |
|||
from, to = text:len() + c.byte + 1, text:len() + c.byte + trail |
|||
end |
end |
||
end |
|||
for trailing_byte = from, to do |
|||
if start == 1 then |
|||
b = text:byte(trailing_byte) |
|||
return pattern |
|||
if not b or b < 0x80 or b > 0xbf then |
|||
end |
|||
error("String " .. text .. " is not UTF-8.") |
|||
return concat(output) .. sub(pattern, start) |
|||
end |
|||
end, true) |
|||
cp = cp * 0x40 + b - 0x80 |
|||
export.pattern_simplifier = pattern_simplifier -- For testing. |
|||
end |
|||
end |
|||
local next_byte = text:byte(to + 1) |
|||
if next_byte and next_byte >= 0x80 and next_byte <= 0xbf then |
|||
function export.len(str) |
|||
-- Too many trailing bytes. |
|||
return type(str) == "number" and len(str) or |
|||
error("String " .. text .. " is not UTF-8.") |
|||
#str - #gsub(str, "[^\128-\191]+", "") |
|||
elseif cp < min then |
|||
end |
|||
-- Overlong encoding. |
|||
ulen = export.len |
|||
error("String " .. text .. " is not UTF-8.") |
|||
function export.sub(str, i, j) |
|||
str, i = type(str) == "number" and tostring(str) or str, i or 1 |
|||
if i < 0 or j and j < 0 then |
|||
return usub(str, i, j) |
|||
elseif j and i > j or i > #str then |
|||
return "" |
|||
end |
|||
local n, new_i = 0 |
|||
for loc1, loc2 in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do |
|||
n = n + loc2 - loc1 |
|||
if not new_i and n >= i then |
|||
new_i = loc2 - (n - i) - 1 |
|||
if not j then |
|||
return sub(str, new_i) |
|||
end |
end |
||
end |
end |
||
if j and n > j then |
|||
return sub(str, new_i, loc2 - (n - j) - 1) |
|||
c.byte = c.byte + iterate_val |
|||
if leading_byte then |
|||
c.char = c.char + iterate_val |
|||
end |
end |
||
end |
|||
return new_i and sub(str, new_i) or "" |
|||
if c[from_type] == pos then |
|||
end |
|||
return c[to_type] |
|||
do |
|||
local function _find(str, loc1, loc2, ...) |
|||
if loc1 and not match(str, "^()[^\128-\255]*$") then |
|||
-- Use raw values of loc1 and loc2 to get loc1 and the length of the match. |
|||
loc1, loc2 = ulen(sub(str, 1, loc1)), ulen(sub(str, loc1, loc2)) |
|||
-- Offset length with loc1 to get loc2. |
|||
loc2 = loc1 + loc2 - 1 |
|||
end |
end |
||
return loc1, loc2, ... |
|||
end |
|||
--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==] |
|||
function export.find(str, pattern, init, plain) |
|||
init = init or 1 |
|||
if init ~= 1 and not match(str, "^()[^\128-\255]*$") then |
|||
return ufind(str, pattern, init, plain) |
|||
elseif plain then |
|||
return _find(str, find(str, pattern, init, true)) |
|||
end |
|||
local simple = pattern_simplifier(pattern) |
|||
if simple then |
|||
return _find(str, find(str, simple, init)) |
|||
end |
|||
return ufind(str, pattern, init) |
|||
end |
end |
||
end |
end |
||
--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==] |
|||
--[==[Converts a character position to the equivalent byte position.]==] |
|||
function export. |
function export.match(str, pattern, init) |
||
init = init or 1 |
|||
return iterate_utf8(text, pos, "char") |
|||
if init ~= 1 and not match(str, "^()[^\128-\255]*$") then |
|||
return umatch(str, pattern, init) |
|||
end |
|||
local simple = pattern_simplifier(pattern) |
|||
if simple then |
|||
return match(str, simple, init) |
|||
end |
|||
return umatch(str, pattern, init) |
|||
end |
end |
||
--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==] |
|||
--[==[Converts a byte position to the equivalent character position.]==] |
|||
function export. |
function export.gmatch(str, pattern) |
||
local |
local simple = pattern_simplifier(pattern) |
||
if simple then |
|||
if byte and byte >= 0x80 and byte <= 0xbf then |
|||
return gmatch(str, simple) |
|||
error("Byte " .. pos .. " is not a leading byte.") |
|||
end |
end |
||
return |
return ugmatch(str, pattern) |
||
end |
end |
||
--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==] |
|||
-- A helper function which iterates through a pattern, and returns two values: a potentially modified version of the pattern, and a boolean indicating whether the returned pattern is simple (i.e. whether it can be used with the stock string library); if not, then the pattern is complex (i.e. it must be used with the ustring library, which is much more resource-intensive). |
|||
function export.gsub(str, pattern, repl, n) |
|||
local simple = pattern_simplifier(pattern) |
|||
if simple then |
|||
-- If `plain` is set, then the pattern is treated as literal (so is always simple). Only used by find. |
|||
return gsub(str, simple, repl, n) |
|||
if plain then |
|||
return pattern, true |
|||
--If none of these are present, then the pattern has to be simple. |
|||
elseif not ( |
|||
pattern:match("%[.-[\128-\255].-%]") or |
|||
pattern:match("[\128-\255][%*%+%?%-]") or |
|||
pattern:match("%%[abcdlpsuwxACDLPSUWXZ]") or |
|||
pattern:match("%[%^[^%]]+%]") or |
|||
pattern:match("%.[^%*%+%-]") or |
|||
pattern:match("%.$") or |
|||
pattern:match("%%b.?[\128-\255]") or |
|||
pattern:match("()", 1, true) |
|||
) then |
|||
return pattern, true |
|||
end |
end |
||
return ugsub(str, pattern, repl, n) |
|||
end |
|||
-- Build up the new pattern in a table, then concatenate at the end. we do it this way, as occasionally entries get modified along the way. |
|||
local new_pattern = {} |
|||
--[==[Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.]==] |
|||
local len, pos, b = pattern:len(), 0 |
|||
function export.plain_gsub(str, pattern, repl, n) |
|||
local char, next_char |
|||
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n) |
|||
end |
|||
-- `escape` and `balanced` are counters, which ensure the effects of % or %b (respectively) are distributed over the following bytes. |
|||
-- `set` is a boolean that states whether the current byte is in a charset. |
|||
--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==] |
|||
-- `capture` keeps track of how many layers of capture groups the position is in, while `captures` keeps a tally of how many groups have been detected (due to the string library limit of 32). |
|||
function export.reverse(str) |
|||
local escape, set, balanced, capture, captures = 0, false, 0, 0, 0 |
|||
return reverse(gsub(str, "[\194-\244][\128-\191]*", reverse)) |
|||
end |
|||
while pos < len do |
|||
pos = pos + 1 |
|||
do |
|||
b = pattern:byte(pos) |
|||
local function err(cp) |
|||
if escape > 0 then escape = escape - 1 end |
|||
error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2) |
|||
if balanced > 0 then balanced = balanced - 1 end |
|||
end |
|||
char = next_char or pattern:sub(pos, pos) |
|||
next_char = pattern:sub(pos + 1, pos + 1) |
|||
local function utf8_char(cp) |
|||
if escape == 0 then |
|||
cp = tonumber(cp) |
|||
if char == "%" then |
|||
if cp < 0 then |
|||
err("-0x" .. format("%X", -cp + 1)) |
|||
if next_char == "." or next_char == "%" or next_char == "[" or next_char == "]" then |
|||
elseif cp < 0x80 then |
|||
escape = 2 |
|||
return char(cp) |
|||
if balanced > 0 then balanced = balanced + 1 end |
|||
elseif cp < 0x800 then |
|||
-- These charsets make the pattern complex. |
|||
return char( |
|||
elseif next_char:match("[acdlpsuwxACDLPSUWXZ]") then |
|||
0xC0 + cp / 0x40, |
|||
return pattern, false |
|||
0x80 + cp % 0x40 |
|||
) |
|||
elseif next_char == "b" then |
|||
elseif cp < 0x10000 then |
|||
balanced = 4 |
|||
if cp >= 0xD800 and cp < 0xE000 then |
|||
end |
|||
return "?" -- mw.ustring.char returns "?" for surrogates. |
|||
-- Enter or leave a charset. |
|||
elseif char == "[" then |
|||
set = true |
|||
elseif char == "]" then |
|||
set = false |
|||
elseif char == "(" then |
|||
capture = capture + 1 |
|||
elseif char == ")" then |
|||
if capture > 0 and set == false and balanced == 0 then |
|||
captures = captures + 1 |
|||
capture = capture - 1 |
|||
end |
|||
end |
end |
||
return char( |
|||
0xE0 + cp / 0x1000, |
|||
0x80 + cp / 0x40 % 0x40, |
|||
0x80 + cp % 0x40 |
|||
) |
|||
elseif cp < 0x110000 then |
|||
return char( |
|||
0xF0 + cp / 0x40000, |
|||
0x80 + cp / 0x1000 % 0x40, |
|||
0x80 + cp / 0x40 % 0x40, |
|||
0x80 + cp % 0x40 |
|||
) |
|||
end |
end |
||
err("0x" .. format("%X", cp)) |
|||
end |
|||
-- Multibyte char. |
|||
if b > 0x7f then |
|||
function export.char(cp, ...) |
|||
-- If followed by "*", "+" or "-", then 2-byte chars can be converted into charsets. However, this is not possible with 3 or 4-byte chars, as the charset would be too permissive, because if the trailing bytes were in a different order then this could be a different valid character. |
|||
if ... == nil then |
|||
return utf8_char(cp) |
|||
local prev_pos = pattern:byte(pos - 1) |
|||
if prev_pos > 0xc1 and prev_pos < 0xe0 then |
|||
new_pattern[#new_pattern] = "[" .. new_pattern[#new_pattern] |
|||
table.insert(new_pattern, char .. "]") |
|||
else |
|||
return pattern, false |
|||
end |
|||
-- If in a charset or used in "%b", then the pattern is complex. |
|||
-- If followed by "?", add "?" after each byte. |
|||
elseif next_char == "?" then |
|||
table.insert(new_pattern, char .. "?") |
|||
local check_pos, check_b, i = pos, pattern:byte(pos), #new_pattern |
|||
while check_b and check_b < 0xc0 do |
|||
check_pos = check_pos - 1 |
|||
check_b = pattern:byte(check_pos) |
|||
i = i - 1 |
|||
new_pattern[i] = new_pattern[i] .. "?" |
|||
end |
|||
pos = pos + 1 |
|||
next_char = pattern:sub(pos + 1, pos + 1) |
|||
elseif set or balanced > 0 then |
|||
return pattern, false |
|||
else |
|||
table.insert(new_pattern, char) |
|||
end |
|||
elseif char == "." then |
|||
-- "*", "+", "-" are always okay after ".", as they don't care how many bytes a char has. |
|||
if set or next_char == "*" or next_char == "+" or next_char == "-" or escape > 0 then |
|||
table.insert(new_pattern, char) |
|||
-- If followed by "?", make sure "?" is after the leading byte of the UTF-8 char pattern, then skip forward one. |
|||
elseif next_char == "?" then |
|||
table.insert(new_pattern, "[%z\1-\127\194-\244]?[\128-\191]*") |
|||
pos = pos + 1 |
|||
next_char = pattern:sub(pos + 1, pos + 1) |
|||
-- If used with "%b", pattern is complex. |
|||
elseif balanced > 0 then |
|||
return pattern, false |
|||
-- Otherwise, add the UTF-8 char pattern. |
|||
else |
|||
table.insert(new_pattern, "[%z\1-\127\194-\244][\128-\191]*") |
|||
end |
|||
-- Negative charsets are always complex, unless the text has no UTF-8 chars. |
|||
elseif char == "[" and next_char == "^" and escape == 0 and text:match("[\128-\255]") then |
|||
return pattern, false |
|||
-- "()" matches the position unless escaped or used with "%b", so always necessitates ustring (as we need it to match the char position, not the byte one). |
|||
elseif char == "(" and next_char == ")" and balanced == 0 and escape == 0 and text:match("[\128-\255]") then |
|||
return pattern, false |
|||
else |
|||
table.insert(new_pattern, char) |
|||
end |
end |
||
local ret = {cp, ...} |
|||
for i = 1, select("#", cp, ...) do |
|||
ret[i] = utf8_char(ret[i]) |
|||
end |
|||
return concat(ret) |
|||
end |
end |
||
u = export.char |
|||
if captures > 32 then |
|||
return pattern, false |
|||
else |
|||
pattern = table.concat(new_pattern) |
|||
return pattern, true |
|||
end |
|||
end |
end |
||
do |
|||
--[==[A version of len which uses string.len, but returns the same result as mw.ustring.len.]==] |
|||
function |
local function get_codepoint(b1, b2, b3, b4) |
||
if b1 < 128 then |
|||
text = tostring(text) |
|||
return b1, 1 |
|||
local len_bytes = text:len() |
|||
elseif b1 < 224 then |
|||
if not text:match("[\128-\255]") then |
|||
return |
return 0x40 * b1 + b2 - 0x3080, 2 |
||
elseif b1 < 240 then |
|||
else |
|||
return |
return 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080, 3 |
||
end |
|||
return 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080, 4 |
|||
end |
end |
||
end |
|||
function export.codepoint(str, i, j) |
|||
--[==[A version of sub which uses string.sub, but returns the same result as mw.ustring.sub.]==] |
|||
if type(str) == "number" then |
|||
function export.sub(text, i_char, j_char) |
|||
return byte(str, i, j) |
|||
text = tostring(text) |
|||
end |
|||
if not text:match("[\128-\255]") then |
|||
i, j = i or 1, j == -1 and #str or i or 1 |
|||
return text:sub(i_char, j_char) |
|||
if i == 1 and j == 1 then |
|||
return (get_codepoint(byte(str, 1, 4))) |
|||
elseif i < 0 or j < 0 then |
|||
return ucodepoint(str, i, j) -- FIXME |
|||
end |
|||
local n, nb, ret, nr = 0, 1, {}, 0 |
|||
while n < j do |
|||
n = n + 1 |
|||
if n < i then |
|||
local b = byte(str, nb) |
|||
nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4) |
|||
else |
|||
local b1, b2, b3, b4 = byte(str, nb, nb + 3) |
|||
if not b1 then |
|||
break |
|||
end |
|||
nr = nr + 1 |
|||
local add |
|||
ret[nr], add = get_codepoint(b1, b2, b3, b4) |
|||
nb = nb + add |
|||
end |
|||
end |
|||
return unpack(ret) |
|||
end |
end |
||
codepoint = export.codepoint |
|||
local i_byte, j_byte |
|||
if j_char then |
|||
function export.gcodepoint(str, i, j) |
|||
if i_char > 0 and j_char > 0 then |
|||
i, j = i or 1, j ~= -1 and j or nil |
|||
if j_char < i_char then return "" end |
|||
if i < 0 or j and j < 0 then |
|||
i_byte = iterate_utf8(text, i_char, "char") |
|||
return ugcodepoint(str, i, j) -- FIXME |
|||
j_byte = iterate_utf8(text, j_char + 1, "char", i_char, i_byte) - 1 |
|||
end |
|||
elseif i_char < 0 and j_char < 0 then |
|||
local n, nb = 1, 1 |
|||
if j_char < i_char then return "" end |
|||
while n < i do |
|||
j_byte = iterate_utf8(text, j_char + 1, "char") - 1 |
|||
local b = byte(str, nb) |
|||
i_byte = iterate_utf8(text, i_char, "char", j_char, j_byte) |
|||
if not b then |
|||
-- For some reason, mw.ustring.sub with i=0, j=0 returns the same result as for i=1, j=1, while string.sub always returns "". However, mw.ustring.sub does return "" with i=1, j=0. As such, we need to adjust j_char to 1 if i_char is either 0, or negative with a magnitude greater than the length of the string. |
|||
break |
|||
elseif j_char == 0 then |
|||
end |
|||
i_byte = iterate_utf8(text, i_char, "char") |
|||
nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4) |
|||
if i_byte == 0 or -i_byte > text:len() then j_char = 1 end |
|||
n = n + 1 |
|||
j_byte = iterate_utf8(text, j_char + 1, "char") - 1 |
|||
end |
|||
i_byte = iterate_utf8(text, i_char, "char") |
|||
return function() |
|||
j_byte = iterate_utf8(text, j_char + 1, "char") - 1 |
|||
if j and n > j then |
|||
return nil |
|||
end |
|||
n = n + 1 |
|||
local b1, b2, b3, b4 = byte(str, nb, nb + 3) |
|||
if not b1 then |
|||
return nil |
|||
end |
|||
local ret, add = get_codepoint(b1, b2, b3, b4) |
|||
nb = nb + add |
|||
return ret |
|||
end |
end |
||
else |
|||
i_byte = iterate_utf8(text, i_char, "char") |
|||
end |
end |
||
return text:sub(i_byte, j_byte) |
|||
end |
end |
||
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==] |
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==] |
||
function export.lower( |
function export.lower(str) |
||
return (match(str, "^()[^\128-\255]*$") and lower or ulower)(str) |
|||
text = tostring(text) |
|||
if not text:match("[\128-\255]") then |
|||
return text:lower() |
|||
else |
|||
return mw.ustring.lower(text) |
|||
end |
|||
end |
end |
||
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==] |
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==] |
||
function export.upper( |
function export.upper(str) |
||
return (match(str, "^()[^\128-\255]*$") and upper or uupper)(str) |
|||
text = tostring(text) |
|||
if not text:match("[\128-\255]") then |
|||
return text:upper() |
|||
else |
|||
return mw.ustring.upper(text) |
|||
end |
|||
end |
end |
||
do |
|||
--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==] |
|||
function |
local function add_captures(text, n, ...) |
||
-- Insert any captures from the splitting pattern. |
|||
text = tostring(text) |
|||
local offset, capture = n - 1, ... |
|||
local simple |
|||
while capture do |
|||
pattern, simple = patternSimplifier(text, pattern, plain) |
|||
n = n + 1 |
|||
-- If the pattern is simple but multibyte characters are present, then init_char needs to be converted into bytes for string.find to work properly, and the return values need to be converted back into chars. |
|||
text[n] = capture |
|||
if simple then |
|||
capture = select(n - offset, ...) |
|||
if not text:match("[\128-\255]") then |
|||
end |
|||
return text:find(pattern, init_char, plain) |
|||
return n |
|||
end |
|||
local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...) |
|||
if not (loc1 and start <= str_len) then |
|||
-- If no match, or there is but we're past the end of the string |
|||
-- (which happens when the match is the empty string), then add |
|||
-- the final chunk and return. |
|||
n = n + 1 |
|||
text[n] = _sub(str, start) |
|||
return |
|||
elseif loc2 < loc1 then |
|||
-- Special case: If we match the empty string, then include the |
|||
-- next character; this avoids an infinite loop, and makes |
|||
-- splitting by an empty string work the way mw.text.split() does |
|||
-- (including non-adjacent empty string matches with %f). If we |
|||
-- reach the end of the string this way, return immediately, so we |
|||
-- don't get a final empty string. If using the string library, we |
|||
-- need to make sure we advance by one UTF-8 character. |
|||
if _sub == sub then |
|||
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1) |
|||
end |
|||
n = n + 1 |
|||
text[n] = _sub(str, start, loc1) |
|||
start = loc1 + 1 |
|||
if start > str_len then |
|||
return ... and add_captures(text, n, ...) or n |
|||
end |
|||
else |
else |
||
-- Add chunk up to the current match. |
|||
local init_byte = init_char and iterate_utf8(text, init_char, "char") |
|||
n = n + 1 |
|||
local byte1, byte2, c1, c2, c3, c4, c5, c6, c7, c8, c9 = text:find(pattern, init_byte, plain) |
|||
text[n] = _sub(str, start, loc1 - 1) |
|||
start = loc2 + 1 |
|||
-- If string.find returned nil, then return nil. |
|||
end |
|||
if not (byte1 and byte2) then |
|||
return (... and add_captures(text, n, ...) or n), start |
|||
return nil |
|||
end |
|||
-- Eager splitter shared by every code path of export.split.
--   str, pattern: the subject string and the splitting pattern.
--   str_len:      length of `str` in the units used by `_sub`/`_find`
--                 (the callers pass #str or ulen(str)).
--   _sub, _find:  the substring and find functions to use.
--   plain:        forwarded to `_find` as its fourth argument.
-- Returns the array of chunks (and captures) that `iterate` fills in.
local function _split(str, pattern, str_len, _sub, _find, plain)
	local text, n, start = {}, 0, 1
	-- `iterate` hands back the updated element count and the position at
	-- which to resume searching; it returns no position once the final
	-- chunk has been stored, which ends the loop.
	repeat
		n, start = iterate(str, str_len, text, n, start, _sub, _find(str, pattern, start, plain))
	until not start
	return text
end
|||
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.]==]
function export.split(str, pattern, str_lib, plain)
	-- Byte-oriented path: requested explicitly via `str_lib`, or implied by
	-- a literal (`plain`) pattern.
	if str_lib or plain then
		return _split(str, pattern, #str, sub, find, plain)
	end
	-- A truthy result from pattern_simplifier is used as the pattern for the
	-- string library in place of the original.
	local simple = pattern_simplifier(pattern)
	if simple then
		return _split(str, simple, #str, sub, find)
	end
	-- Otherwise fall back to the ustring library, measuring the string with
	-- ulen rather than the byte-length operator.
	return _split(str, pattern, ulen(str), usub, ufind)
end
|||
export.capturing_split = export.split -- To be removed. |
|||
end |
|||
do |
|||
-- TODO: merge this with export.split. Not clear how to do this while |
|||
-- maintaining the same level of performance, as gsplit is slower. |
|||
local function _split(str, pattern, str_len, _sub, _find, plain) |
|||
local start, final = 1 |
|||
local function iter(loc1, loc2, ...) |
|||
-- If no match, return the final chunk. |
|||
if not loc1 then |
|||
final = true |
|||
return _sub(str, start) |
|||
end |
end |
||
-- Special case: If we match the empty string, then eat the |
|||
-- next character; this avoids an infinite loop, and makes |
|||
-- Get first return value. If we have a positive init_char, we can save resources by resuming at that point. |
|||
-- splitting by the empty string work the way mw.text.gsplit() does |
|||
local char1, char2 |
|||
-- (including non-adjacent empty string matches with %f). If we |
|||
if (not init_char) or init_char > 0 then |
|||
-- reach the end of the string this way, set `final` to true, so we |
|||
char1 = iterate_utf8(text, byte1, "byte", init_byte, init_char) |
|||
-- don't get stuck matching the empty string at the end. |
|||
local chunk |
|||
if loc2 < loc1 then |
|||
-- If using the string library, we need to make sure we advance |
|||
-- by one UTF-8 character. |
|||
if _sub == sub then |
|||
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1) |
|||
end |
|||
chunk = _sub(str, start, loc1) |
|||
if loc1 >= str_len then |
|||
final = true |
|||
else |
|||
start = loc1 + 1 |
|||
end |
|||
-- Eat chunk up to the current match. |
|||
else |
else |
||
chunk = _sub(str, start, loc1 - 1) |
|||
start = loc2 + 1 |
|||
end |
end |
||
return chunk, ... |
|||
end |
|||
-- If byte1 and byte2 are the same, don't bother running iterate_utf8 twice. Otherwise, resume iterate_utf8 from byte1 to find char2. |
|||
if byte1 == byte2 then |
|||
return function() |
|||
char2 = char1 |
|||
if not final then |
|||
else |
|||
return iter(_find(str, pattern, start, plain)) |
|||
end |
end |
||
return nil |
|||
return unpack{char1, char2, c1, c2, c3, c4, c5, c6, c7, c8, c9} |
|||
end |
end |
||
end |
|||
else |
|||
return mw.ustring.find(text, pattern, init_char, plain) |
|||
function export.gsplit(str, pattern, str_lib, plain) |
|||
if str_lib or plain then |
|||
return _split(str, pattern, #str, sub, find, plain) |
|||
end |
|||
local simple = pattern_simplifier(pattern) |
|||
if simple then |
|||
return _split(str, simple, #str, sub, find) |
|||
end |
|||
return _split(str, pattern, ulen(str), usub, ufind) |
|||
end |
end |
||
end |
end |
||
function export.trim(str, charset) |
|||
--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==] |
|||
if not charset then |
|||
function export.match(text, pattern, init) |
|||
return match(str, "^()%s*$") and "" or match(str, "^%s*(.*%S)") |
|||
text = tostring(text) |
|||
elseif match(charset, "^()[^\128-\255]*$") then |
|||
local simple |
|||
return match(str, "^()[" .. charset .. "]*$") and "" or match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])") |
|||
pattern, simple = patternSimplifier(text, pattern) |
|||
if simple then |
|||
if init and text:find("[\128-\255]") then |
|||
init = iterate_utf8(text, init, "char") |
|||
end |
|||
return text:match(pattern, init) |
|||
else |
|||
return mw.ustring.match(text, pattern, init) |
|||
end |
end |
||
return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$") |
|||
end |
end |
||
do |
|||
--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==] |
|||
local entities |
|||
function export.gmatch(text, pattern) |
|||
text = tostring(text) |
|||
local function decode_numeric_entity(code, pattern, base) |
|||
local simple |
|||
local cp = match(code, pattern) and tonumber(code, base) |
|||
pattern, simple = patternSimplifier(text, pattern) |
|||
return cp and cp < 0x110000 and u(cp) or nil |
|||
if simple then |
|||
end |
|||
return text:gmatch(pattern) |
|||
else |
|||
local function decode_entity(hash, x, code) |
|||
return mw.ustring.gmatch(text, pattern) |
|||
if hash == "#" then |
|||
return x == "" and decode_numeric_entity(code, "^%d+$") or |
|||
decode_numeric_entity(code, "^%x+$", 16) |
|||
end |
|||
entities = entities or load_data("Module:data/entities") |
|||
return entities[x .. code] |
|||
end |
|||
-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]]. |
|||
function export.decode_entities(str) |
|||
return find(str, "&", 1, true) and |
|||
gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str |
|||
end |
end |
||
end |
end |
||
do |
|||
--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==] |
|||
local html_entities |
|||
function export.gsub(text, pattern, repl, n) |
|||
text = tostring(text) |
|||
local |
local function encode_entity(ch) |
||
local entity = html_entities[ch] |
|||
pattern, simple = patternSimplifier(text, pattern) |
|||
if entity then |
|||
return |
return entity |
||
end |
|||
else |
|||
entity = "&#" .. codepoint(ch) .. ";" |
|||
return mw.ustring.gsub(text, pattern, repl, n) |
|||
html_entities[ch] = entity |
|||
return entity |
|||
end |
|||
function export.encode_entities(str, charset, str_lib, plain) |
|||
-- Memoized HTML entities (taken from mw.text.lua). |
|||
html_entities = html_entities or { |
|||
["\""] = """, |
|||
["&"] = "&", |
|||
["'"] = "'", |
|||
["<"] = "<", |
|||
[">"] = ">", |
|||
["\194\160"] = " ", |
|||
} |
|||
if not charset then |
|||
return (gsub(str, "[\"&'<>\194]\160?", html_entities)) |
|||
elseif plain then |
|||
return (gsub(str, "[" .. charset_escape(charset) .. "]", encode_entity)) |
|||
elseif str_lib then |
|||
if not match(charset, "^()[^\128-\255]*$") then |
|||
error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.") |
|||
end |
|||
return (gsub(str, "[" .. charset .. "]", encode_entity)) |
|||
end |
|||
local pattern = charset and "[" .. charset .. "]" |
|||
local simple = pattern_simplifier(pattern) |
|||
if simple then |
|||
return (gsub(str, simple, encode_entity)) |
|||
end |
|||
return (ugsub(str, pattern, encode_entity)) |
|||
end |
end |
||
end |
end |
||
do |
|||
--[==[ |
|||
local function decode_path(code) |
|||
-- Reimplementation of mw.ustring.split() that includes any capturing |
|||
return char(tonumber(code, 16)) |
|||
-- groups in the splitting pattern. This works like Python's re.split() |
|||
end |
|||
-- function, except that it has Lua's behavior when the split pattern |
|||
-- is empty (i.e. advancing by one character at a time; Python returns the |
|||
local function decode(lead, trail) |
|||
-- whole remainder of the string). |
|||
if lead == "+" or lead == "_" then |
|||
]==] |
|||
return " " .. trail |
|||
function export.capturing_split(str, pattern) |
|||
elseif #trail == 2 then |
|||
local ret = {} |
|||
return decode_path(trail) |
|||
-- (.-) corresponds to (.*?) in Python or Perl; () captures the |
|||
-- current position after matching. |
|||
pattern = "(.-)" .. pattern .. "()" |
|||
local start = 1 |
|||
while true do |
|||
-- Did we reach the end of the string? |
|||
if start > #str then |
|||
table.insert(ret, "") |
|||
return ret |
|||
end |
end |
||
return lead .. trail |
|||
-- match() returns all captures as multiple return values; |
|||
end |
|||
-- we need to insert into a table to get them all. |
|||
local captures = {export.match(str, pattern, start)} |
|||
function export.decode_uri(str, enctype) |
|||
-- If no match, add the remainder of the string. |
|||
enctype = enctype and upper(enctype) or "QUERY" |
|||
if #captures == 0 then |
|||
if enctype == "PATH" then |
|||
table.insert(ret, export.sub(str, start)) |
|||
return |
return find(str, "%", 1, true) and |
||
gsub(str, "%%(%x%x)", decode_path) or str |
|||
elseif enctype == "QUERY" then |
|||
return (find(str, "%", 1, true) or find(str, "+", 1, true)) and |
|||
gsub(str, "([%%%+])(%x?%x?)", decode) or str |
|||
elseif enctype == "WIKI" then |
|||
return (find(str, "%", 1, true) or find(str, "_", 1, true)) and |
|||
gsub(str, "([%%_])(%x?%x?)", decode) or str |
|||
end |
|||
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2) |
|||
end |
|||
end |
|||
do |
|||
local function _remove_comments(str, pre) |
|||
local head = find(str, "<!--", 1, true) |
|||
if not head then |
|||
return str |
|||
end |
end |
||
local |
local ret, n = {sub(str, 1, head - 1)}, 1 |
||
while true do |
|||
-- Special case: If we don't advance by any characters, then advance |
|||
local loc = find(str, "-->", head + 4, true) |
|||
-- by one character; this avoids an infinite loop, and makes splitting |
|||
if not loc then |
|||
-- by an empty string work the way mw.ustring.split() does. If we |
|||
return pre and concat(ret) or |
|||
-- reach the end of the string this way, return immediately, so we |
|||
concat(ret) .. sub(str, head) |
|||
-- don't get a final empty string. |
|||
if newstart == start then |
|||
table.insert(ret, export.sub(str, start, start)) |
|||
table.remove(captures, 1) |
|||
start = start + 1 |
|||
if start > #str then |
|||
return ret |
|||
end |
end |
||
head = loc + 3 |
|||
else |
|||
loc = find(str, "<!--", head, true) |
|||
table.insert(ret, table.remove(captures, 1)) |
|||
if not loc then |
|||
return concat(ret) .. sub(str, head) |
|||
end |
|||
n = n + 1 |
|||
ret[n] = sub(str, head, loc - 1) |
|||
head = loc |
|||
end |
end |
||
end |
|||
-- Insert any captures from the splitting pattern. |
|||
for _, x in ipairs(captures) do |
|||
--[==[Removes any HTML comments from the input text. `stage` can be one of three options: |
|||
table.insert(ret, x) |
|||
* {{lua|"PRE"}} (default) applies the method used by MediaWiki's preprocessor: all {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead). |
|||
* {{lua|"POST"}} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code||<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code||<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will have already been applied by the native parser. |
|||
* {{lua|"BOTH"}} applies {{lua|"PRE"}} then {{lua|"POST"}}.]==] |
|||
function export.remove_comments(str, stage) |
|||
if not stage or stage == "PRE" then |
|||
return _remove_comments(str, true) |
|||
end |
|||
local processed = stage == "POST" and _remove_comments(str) or |
|||
stage == "BOTH" and _remove_comments(str, true) or |
|||
error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2) |
|||
while processed ~= str do |
|||
str = processed |
|||
processed = _remove_comments(str) |
|||
end |
end |
||
return str |
|||
end |
end |
||
end |
end |
||
--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==] |
|||
local function uclcfirst(text, dolower) |
|||
function export.php_trim(str) |
|||
return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or "" |
|||
-- Actual function to re-case of the first letter. |
|||
end |
|||
local first_letter = export.sub(text, 1, 1) |
|||
php_trim = export.php_trim |
|||
first_letter = dolower and export.lower(first_letter) or export.upper(first_letter) |
|||
return first_letter .. export.sub(text, 2) |
|||
--[==[Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} is normalized to {{code|lua|1}} (a number), and {{code|lua|" foo "}} is normalized to {{code|lua|"foo"}}. If the input is not a string, it is returned unchanged. |
|||
After being trimmed with {{code|lua|export.php_trim}}, strings are converted to numbers if: |
|||
# They are integers, with no decimals (2.0) or leading zeroes (02). |
|||
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}. |
|||
# For positive values, they do not have a leading {{code|lua|+}} sign.]==] |
|||
function export.scribunto_param_key(key) |
|||
if type(key) ~= "string" then |
|||
return key |
|||
end |
end |
||
key = php_trim(key) |
|||
-- If there's a link at the beginning, re-case the first letter of the |
|||
if match(key, "^-?[1-9]%d*$") then |
|||
-- link text. This pattern matches both piped and unpiped links. |
|||
local num = tonumber(key) |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true. |
|||
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$") |
|||
return ( |
|||
if link then |
|||
num <= 9007199254740991 and num >= -9007199254740991 or |
|||
return "[[" .. link .. "|" .. douclcfirst(linktext ~= "" and linktext or link) .. "]]" .. remainder |
|||
key == "9007199254740992" or |
|||
key == "-9007199254740992" |
|||
) and num or key |
|||
elseif key == "0" then |
|||
return 0 |
|||
end |
end |
||
return |
return key |
||
end |
end |
||
do |
|||
function export.ucfirst(text) |
|||
local byte_escapes |
|||
return uclcfirst(text, false) |
|||
local function escape_byte(b) |
|||
return byte_escapes[b] or format("\\%03d", byte(b)) |
|||
end |
|||
function export.escape_bytes(str) |
|||
byte_escapes = byte_escapes or load_data("Module:string utilities/data").byte_escapes |
|||
return (gsub(str, ".", escape_byte)) |
|||
end |
|||
end |
end |
||
function export. |
function export.format_fun(str, fun) |
||
return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2) |
|||
return uclcfirst(text, true) |
|||
if #p1 + #p2 == 1 then |
|||
return name == "op" and "{" or |
|||
name == "cl" and "}" or |
|||
error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'") |
|||
elseif fun(name) and type(fun(name)) ~= "string" then |
|||
error(module_name .. ".format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string") |
|||
end |
|||
return fun(name) or error(module_name .. ".format: \"" .. name .. "\" not found in table") |
|||
end)) |
|||
end |
|||
format_fun = export.format_fun |
|||
--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
	-- Delegate to format_fun, supplying a lookup function that reads each
	-- parameter name straight from `tbl`.
	local function lookup(key)
		return tbl[key]
	end
	return format_fun(str, lookup)
end
||
-- Faster version of mw.text.nowiki, with minor changes to match the PHP equivalent: ";" always escapes, and colons in unslashed protocols only escape after regex \b. |
|||
do |
do |
||
local function |
local function do_uclcfirst(str, case_func) |
||
-- Actual function to re-case the first letter. |
|||
if str2 then |
|||
local first_letter = case_func(match(str, "^.[\128-\191]*") or "") |
|||
return str1 .. "&#" .. str2:byte() .. ";" |
|||
return first_letter .. sub(str, #first_letter + 1) |
|||
end |
|||
return "&#" .. str1:byte() .. ";" |
|||
end |
end |
||
local function |
local function uclcfirst(str, case_func) |
||
-- If there's a link at the beginning, re-case the first letter of the |
|||
local uri_schemes = mw.loadData("Module:string utilities/data").uri_schemes |
|||
-- link text. This pattern matches both piped and unpiped links. |
|||
return uri_schemes[uri:lower()] and uri .. ":" or uri .. ":" |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$") |
|||
if link then |
|||
return "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder |
|||
end |
|||
return do_uclcfirst(str, case_func) |
|||
end |
end |
||
function export. |
function export.ucfirst(str) |
||
return ( |
return uclcfirst(str, uupper) |
||
:gsub("[\"&';<=>%[%]{|}]", escape_char) |
|||
:gsub("^[\t\n\r #%*:]", escape_char) |
|||
:gsub("([\n\r])([\t\n\r #%*:])", escape_char) |
|||
:gsub("%f[^%z\r\n]%-(%-%-%-)", "-%1") |
|||
:gsub("__", "__") |
|||
:gsub("://", "://") |
|||
:gsub("(ISBN)(%s)", escape_char) |
|||
:gsub("(PMID)(%s)", escape_char) |
|||
:gsub("(RFC)(%s)", escape_char) |
|||
:gsub("([%w_]+):", escape_uri)) |
|||
end |
end |
||
end |
|||
function export. |
function export.lcfirst(str) |
||
return uclcfirst(str, ulower) |
|||
if type(text) == "table" then |
|||
-- allow calling from a template |
|||
text = text.args[1] |
|||
end |
end |
||
-- Capitalize multi-word that is separated by spaces |
|||
local function capitalize(w) |
|||
-- by uppercasing the first letter of each part. |
|||
return uclcfirst(w, uupper) |
|||
-- I assume nobody will input all CAP text. |
|||
end |
|||
w2 = {} |
|||
for w in export.gmatch(text, "%S+") do |
|||
--[==[Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.]==] |
|||
table.insert(w2, uclcfirst(w, false)) |
|||
function export.capitalize(str) |
|||
if type(str) == "table" then |
|||
-- allow calling from a template |
|||
str = str.args[1] |
|||
end |
|||
-- Capitalize multi-word that is separated by spaces |
|||
-- by uppercasing the first letter of each part. |
|||
-- I assume nobody will input all CAP text. |
|||
return (ugsub(str, "%S+", capitalize)) |
|||
end |
end |
||
return table.concat(w2, " ") |
|||
end |
end |
||
do |
|||
function export.pluralize(text) |
|||
local function word_ends_in_consonant_plus_y(str) |
|||
if type(text) == "table" then |
|||
-- allow calling from a template |
|||
text = text.args[1] |
|||
end |
|||
-- Pluralize a word in a smart fashion, according to normal English rules. |
|||
-- 1. If word ends in consonant + -y, replace the -y with -ies. |
|||
-- 2. If the word ends in -s, -x, -z, -sh, -ch, add -es. |
|||
-- 3. Otherwise, add -s. |
|||
-- This handles links correctly: |
|||
-- 1. If a piped link, change the second part appropriately. |
|||
-- 2. If a non-piped link and rule #1 above applies, convert to a piped link |
|||
-- with the second part containing the plural. |
|||
-- 3. If a non-piped link and rules #2 or #3 above apply, add the plural |
|||
-- outside the link. |
|||
local function word_ends_in_consonant_plus_y(text) |
|||
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't |
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't |
||
-- apply to proper nouns, hence "the Gettys", "the public Ivys". |
-- apply to proper nouns, hence "the Gettys", "the public Ivys". |
||
ထႅဝ် 605: | ထႅဝ် 964: | ||
-- be important as this function is almost always called on common nouns |
-- be important as this function is almost always called on common nouns |
||
-- (e.g. parts of speech, place types). |
-- (e.g. parts of speech, place types). |
||
return |
return find(str, "[^aeiouAEIOU ]y$") |
||
end |
end |
||
local function word_takes_es_plural( |
local function word_takes_es_plural(str) |
||
return |
return find(str, "[sxz]$") or find(str, "[cs]h$") |
||
end |
end |
||
local function do_pluralize( |
local function do_pluralize(str) |
||
if word_ends_in_consonant_plus_y( |
if word_ends_in_consonant_plus_y(str) then |
||
-- avoid returning multiple values |
-- avoid returning multiple values |
||
return (gsub(str, "y$", "ies")) |
|||
elseif word_takes_es_plural(str) then |
|||
return hack_single_retval |
|||
return str .. "es" |
|||
elseif word_takes_es_plural(text) then |
|||
return text .. "es" |
|||
else |
|||
return text .. "s" |
|||
end |
end |
||
return str .. "s" |
|||
end |
|||
end |
|||
-- Check for a link. This pattern matches both piped and unpiped links. |
|||
--[==[ |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
Pluralize a word in a smart fashion, according to normal English rules. |
|||
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$") |
|||
# If word ends in consonant + -y, replace the -y with -ies. |
|||
if link then |
|||
# If the word ends in -s, -x, -z, -sh, -ch, add -es. |
|||
if linktext ~= "" then |
|||
# Otherwise, add -s. |
|||
This handles links correctly: |
|||
# If a piped link, change the second part appropriately. |
|||
# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural. |
|||
# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link. |
|||
]==] |
|||
function export.pluralize(str) |
|||
if type(str) == "table" then |
|||
-- allow calling from a template |
|||
str = str.args[1] |
|||
end |
|||
-- Check for a link. This pattern matches both piped and unpiped links. |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$") |
|||
if not link then |
|||
return do_pluralize(str) |
|||
elseif linktext ~= "" then |
|||
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]" |
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]" |
||
elseif word_ends_in_consonant_plus_y(link) then |
|||
end |
|||
return beginning .. "[[" .. link .. "|" .. gsub(link, "y$", "ies") .. "]]" |
|||
if word_ends_in_consonant_plus_y(link) then |
|||
return beginning .. "[[" .. link .. "|" .. link:gsub("y$", "ies") .. "]]" |
|||
end |
end |
||
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s") |
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s") |
||
end |
end |
||
return do_pluralize(text) |
|||
end |
end |
||
do |
|||
function export.singularize(text) |
|||
local function do_singularize(str) |
|||
if type(text) == "table" then |
|||
local sing = match(str, "^(.-)ies$") |
|||
-- allow calling from a template |
|||
text = text.args[1] |
|||
end |
|||
-- Singularize a word in a smart fashion, according to normal English rules. |
|||
-- Works analogously to pluralize(). |
|||
-- NOTE: This doesn't always work as well as pluralize(). Beware. It will |
|||
-- mishandle cases like "passes" -> "passe", "eyries" -> "eyry". |
|||
-- 1. If word ends in -ies, replace -ies with -y. |
|||
-- 2. If the word ends in -xes, -shes, -ches, remove -es. [Does not affect |
|||
-- -ses, cf. "houses", "impasses".] |
|||
-- 3. Otherwise, remove -s. |
|||
-- This handles links correctly: |
|||
-- 1. If a piped link, change the second part appropriately. Collapse the |
|||
-- link to a simple link if both parts end up the same. |
|||
-- 2. If a non-piped link, singularize the link. |
|||
-- 3. A link like "[[parish]]es" will be handled correctly because the |
|||
-- code that checks for -shes etc. allows ] characters between the |
|||
-- 'sh' etc. and final -es. |
|||
local function do_singularize(text) |
|||
local sing = text:match("^(.-)ies$") |
|||
if sing then |
if sing then |
||
return sing .. "y" |
return sing .. "y" |
||
end |
end |
||
-- Handle cases like "[[parish]]es" |
-- Handle cases like "[[parish]]es" |
||
return match(str, "^(.-[sc]h%]*)es$") or |
|||
if sing then |
|||
return sing |
|||
end |
|||
-- Handle cases like "[[box]]es" |
-- Handle cases like "[[box]]es" |
||
match(str, "^(.-x%]*)es$") or |
|||
-- Handle regular plurals |
|||
if sing then |
|||
match(str, "^(.-)s$") or |
|||
return sing |
|||
-- Otherwise, return input |
|||
end |
|||
str |
|||
local sing = text:match("^(.-)s$") |
|||
if sing then |
|||
return sing |
|||
end |
|||
return text |
|||
end |
end |
||
local function collapse_link(link, linktext) |
local function collapse_link(link, linktext) |
||
if link == linktext then |
if link == linktext then |
||
return "[[" .. link .. "]]" |
return "[[" .. link .. "]]" |
||
else |
|||
return "[[" .. link .. "|" .. linktext .. "]]" |
|||
end |
end |
||
return "[[" .. link .. "|" .. linktext .. "]]" |
|||
end |
end |
||
--[==[ |
|||
Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}. |
|||
'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry". |
|||
-- Check for a link. This pattern matches both piped and unpiped links. |
|||
# If word ends in -ies, replace -ies with -y. |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
# If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".] |
|||
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$") |
|||
# Otherwise, remove -s. |
|||
if link then |
|||
if linktext ~= "" then |
|||
This handles links correctly: |
|||
# If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same. |
|||
# If a non-piped link, singularize the link. |
|||
# A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the |
|||
'sh' etc. and final -es. |
|||
]==] |
|||
function export.singularize(str) |
|||
if type(str) == "table" then |
|||
-- allow calling from a template |
|||
str = str.args[1] |
|||
end |
|||
-- Check for a link. This pattern matches both piped and unpiped links. |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$") |
|||
if not link then |
|||
return do_singularize(str) |
|||
elseif linktext ~= "" then |
|||
return beginning .. collapse_link(link, do_singularize(linktext)) |
return beginning .. collapse_link(link, do_singularize(linktext)) |
||
end |
end |
||
return beginning .. "[[" .. do_singularize(link) .. "]]" |
return beginning .. "[[" .. do_singularize(link) .. "]]" |
||
end |
end |
||
return do_singularize(text) |
|||
end |
end |
||
--[==[ |
|||
Return the appropriate indefinite article to prefix to `str`. Correctly handles links and capitalized text. |
|||
function export.add_indefinite_article(text, uppercase) |
|||
Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning with |
|||
a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase. |
|||
]==] |
|||
function export.get_indefinite_article(str, ucfirst) |
|||
str = str or "" |
|||
local is_vowel = false |
local is_vowel = false |
||
-- If there's a link at the beginning, examine the first letter of the |
-- If there's a link at the beginning, examine the first letter of the |
||
-- link text. This pattern matches both piped and unpiped links. |
-- link text. This pattern matches both piped and unpiped links. |
||
-- If the link is not piped, the second capture (linktext) will be empty. |
-- If the link is not piped, the second capture (linktext) will be empty. |
||
local link, linktext |
local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]") |
||
if link then |
if link then |
||
is_vowel = |
is_vowel = find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]") |
||
else |
else |
||
is_vowel = |
is_vowel = find(str, "^[AEIOUaeiou]") |
||
end |
end |
||
return |
return is_vowel and (ucfirst and "An" or "an") or (ucfirst and "A" or "a") |
||
end |
end |
||
get_indefinite_article = export.get_indefinite_article |
|||
--[==[ |
|||
-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.). |
|||
Prefix `text` with the appropriate indefinite article to prefix to `text`. Correctly handles links and capitalized |
|||
function export.escape_risky_characters(text) |
|||
text. Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning |
|||
-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software. |
|||
with a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase. |
|||
if not mw.ustring.match(text, "%S") then |
|||
]==] |
|||
return mw.text.encode(text, "%s") |
|||
function export.add_indefinite_article(text, ucfirst) |
|||
else |
|||
return get_indefinite_article(text, ucfirst) .. " " .. text |
|||
return mw.text.encode(text, "!#%%&*+/:;<=>?@[\\%]_{|}") |
|||
end |
|||
end |
end |
||
ၶိုၼ်းၶူၼ်ႉၶႆႈၼင်ႇ 16:32, 26 မေႇ 2024
Provides some utility functions for manipulating strings.
Functions
export.explode_utf8
function export.explode_utf8(str)
Explodes a string into an array of UTF-8 characters. Warning: this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.
export.pattern_escape
function export.pattern_escape(str)
Escapes the magic characters used in patterns (Lua's version of regular expressions): $%()*+-.?[]^
. For example, "^$()%.[]*+-?"
becomes "%^%$%(%)%%%.%[%]%*%+%-%?"
. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).
export.charset_escape
function export.charset_escape(str)
Escapes only the magic characters used in pattern character sets: %-]^
.
export.replacement_escape
function export.replacement_escape(str)
Escapes only %
, which is the only magic character used in replacement patterns with string.gsub and mw.ustring.gsub.
export.len
function export.len(str)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.sub
function export.sub(str, i, j)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.find
function export.find(str, pattern, init, plain)
A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.
export.match
function export.match(str, pattern, init)
A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.
export.gmatch
function export.gmatch(str, pattern)
A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.
export.gsub
function export.gsub(str, pattern, repl, n)
A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.
export.plain_gsub
function export.plain_gsub(str, pattern, repl, n)
Like gsub, but pattern-matching facilities are turned off, so pattern
and repl
(if a string) are treated as literal.
export.reverse
function export.reverse(str)
Reverses a UTF-8 string character by character; this is the UTF-8-aware equivalent of string.reverse.
export.char
function export.char(cp, ...)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.codepoint
function export.codepoint(str, i, j)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.gcodepoint
function export.gcodepoint(str, i, j)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.lower
function export.lower(str)
A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.
export.upper
function export.upper(str)
A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.
export.split
function export.split(str, pattern, str_lib, plain)
Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: str_lib
forces use of the string library, while plain
turns any pattern matching facilities off, treating pattern
as literal.
export.gsplit
function export.gsplit(str, pattern, str_lib, plain)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.trim
function export.trim(str, charset)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.decode_entities
function export.decode_entities(str)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.encode_entities
function export.encode_entities(str, charset, str_lib, plain)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.decode_uri
function export.decode_uri(str, enctype)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.remove_comments
function export.remove_comments(str, stage)
Removes any HTML comments from the input text. stage
can be one of three options:
"PRE"
(default) applies the method used by MediaWiki's preprocessor: all<!-- ... -->
pairs are removed, as well as any text after an unclosed<!--
. This is generally suitable when parsing raw template or parser extension tag code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use Module:template parser instead)."POST"
applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any<!-- ... -->
pairs until no more are found (e.g.<!-<!-- ... -->- ... -->
would be fully removed), but any unclosed<!--
is ignored. This is suitable for handling links embedded in template inputs, where the"PRE"
method will have already been applied by the native parser."BOTH"
applies"PRE"
then"POST"
.
export.php_trim
function export.php_trim(str)
Lua equivalent of PHP's trim($string)
, which trims "\0"
, "\t"
, "\n"
, "\v"
, "\r"
and " "
. This is useful when dealing with template parameters, since the native parser trims them like this.
export.scribunto_param_key
function export.scribunto_param_key(key)
Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a frame.args
table). For example, "1"
is normalized to 1
(a number), and " foo "
is normalized to "foo"
. If the input is not a string, it is returned unchanged.
After being trimmed with export.php_trim
, strings are converted to numbers if:
- They are integers, with no decimals (2.0) or leading zeroes (02).
- They are ≤ 2^53 and ≥ -2^53.
- For positive values, they do not have a leading
+
sign.
export.escape_bytes
function export.escape_bytes(str)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.format_fun
function export.format_fun(str, fun)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.format
function export.format(str, tbl)
This function, unlike string.format
and mw.ustring.format
, takes just two parameters—a format string and a table—and replaces all instances of {param_name
} in the format string with the table's entry for param_name
. The opening and closing brace characters can be escaped with {\op} and {\cl}, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
Examples
string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})
produces
:"one fish, two fish, red fish, blue fish"
string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})
produces
:"The set {1, 2, 3} contains three elements."
- Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.
export.ucfirst
function export.ucfirst(str)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.lcfirst
function export.lcfirst(str)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.capitalize
function export.capitalize(str)
Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.
export.pluralize
function export.pluralize(str)
Pluralize a word in a smart fashion, according to normal English rules.
- If word ends in consonant + -y, replace the -y with -ies.
- If the word ends in -s, -x, -z, -sh, -ch, add -es.
- Otherwise, add -s.
This handles links correctly:
- If a piped link, change the second part appropriately.
- If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
- If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.
export.singularize
function export.singularize(str)
Singularize a word in a smart fashion, according to normal English rules. Works analogously to pluralize()
.
NOTE: This doesn't always work as well as pluralize()
. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
- If word ends in -ies, replace -ies with -y.
- If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
- Otherwise, remove -s.
This handles links correctly:
- If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same.
- If a non-piped link, singularize the link.
- A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the 'sh' etc. and final -es.
export.get_indefinite_article
function export.get_indefinite_article(str, ucfirst)
Return the appropriate indefinite article to prefix to str
. Correctly handles links and capitalized text.
Does not correctly handle words like union, uniform and university that take "a" despite beginning with
a 'u'. The returned article will have its first letter capitalized if ucfirst
is specified, otherwise lowercase.
export.add_indefinite_article
function export.add_indefinite_article(text, ucfirst)
Prefix text
with the appropriate indefinite article to prefix to text
. Correctly handles links and capitalized
text. Does not correctly handle words like union, uniform and university that take "a" despite beginning
with a 'u'. The returned article will have its first letter capitalized if ucfirst
is specified, otherwise lowercase.
local mw = mw
local string = string
local table = table
local ustring = mw.ustring
local byte = string.byte
local char = string.char
local concat = table.concat
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local len = string.len
local load_data = mw.loadData
local lower = string.lower
local match = string.match
local next = next
local reverse = string.reverse
local select = select
local sort = table.sort
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ucodepoint = ustring.codepoint
local ufind = ustring.find
local ugcodepoint = ustring.gcodepoint
local ugmatch = ustring.gmatch
local ugsub = ustring.gsub
local ulower = ustring.lower
local umatch = ustring.match
local unpack = unpack
local upper = string.upper
local usub = ustring.sub
local uupper = ustring.upper
-- Defined below.
local charset_escape
local codepoint
local explode_utf8
local format_fun
local get_indefinite_article
local pattern_escape
local pattern_simplifier
local php_trim
local replacement_escape
local u
local ulen
local module_name = "string_utilities"
local export = {}
--[==[Splits `str` into a list containing one entry per UTF-8 character, in order. '''Warning''': for the sake of speed and memory use, the input is not validated, so the result for strings containing invalid UTF-8 byte sequences is undefined.]==]
function export.explode_utf8(str)
	local chars, count = {}, 0
	-- Each match is a single byte plus any continuation bytes (0x80-0xBF)
	-- that immediately follow it.
	local next_char = gmatch(str, ".[\128-\191]*")
	local ch = next_char()
	while ch do
		count = count + 1
		chars[count] = ch
		ch = next_char()
	end
	return chars
end
explode_utf8 = export.explode_utf8
--[==[Returns `str` with every [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] magic character (<code>$%()*+-.?[]^</code>) prefixed by <code>%</code>, so that the result matches `str` literally when used as a pattern. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. Use this when building a pattern out of arbitrary text (e.g. user input).]==]
function export.pattern_escape(str)
	-- Assigning to a single local drops gsub's second return value (the
	-- substitution count).
	local escaped = gsub(str, "[$%%()*+%-.?[%]^]", "%%%0")
	return escaped
end
pattern_escape = export.pattern_escape
--[==[Returns `str` with only those characters escaped that are magic inside a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character set (<code>%-]^</code>), so that the result can be placed between <code>[</code> and <code>]</code>.]==]
function export.charset_escape(str)
	-- Assigning to a single local drops gsub's substitution count.
	local escaped = gsub(str, "[%%%-%]^]", "%%%0")
	return escaped
end
charset_escape = export.charset_escape
--[==[Doubles every <code>%</code> in `str`. <code>%</code> is the only magic character in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] used with string.gsub and mw.ustring.gsub, so the result is safe to use as a literal replacement string.]==]
function export.replacement_escape(str)
	-- Assigning to a single local drops gsub's substitution count.
	local escaped = gsub(str, "%%", "%%%%")
	return escaped
end
replacement_escape = export.replacement_escape
do
-- Recursively compares two nested tables. Returns true only if both have
-- exactly the same keys and, for each key, either identical values or
-- sub-tables that are themselves equal under this function.
local function check_sets_equal(set1, set2)
	-- `cursor` steps through set2 once per key of set1, so that afterwards
	-- we can tell whether set2 has any keys left over.
	local cursor
	for key, val1 in next, set1 do
		local val2 = set2[key]
		if val1 ~= val2 then
			if val2 == nil then
				return false
			end
			if not check_sets_equal(val1, val2) then
				return false
			end
		end
		cursor = next(set2, cursor)
	end
	return next(set2, cursor) == nil
end
-- Checks that the nested table `bytes` is uniform. Returns true straight away
-- if the first value found is `true`. Otherwise the first value must itself
-- pass check_sets, and every other value in `bytes` must be equal to it
-- according to check_sets_equal.
local function check_sets(bytes)
	local first_key, first_set = next(bytes)
	if first_set == true then
		return true
	end
	if not check_sets(first_set) then
		return false
	end
	-- Resume the traversal after the first key and compare the rest with it.
	for _, other_set in next, bytes, first_key do
		if not check_sets_equal(other_set, first_set) then
			return false
		end
	end
	return true
end
-- Turns `range`, a list of byte values, into pattern text that matches any one
-- of them. A single byte is returned as a bare character. Otherwise the list
-- is sorted in place and emitted as a bracketed set, with each run of
-- consecutive values collapsed into an "a-b" range.
local function make_charset(range)
	local count = #range
	if count == 1 then
		return char(range[1])
	end
	sort(range)
	local parts, run_start = {}, range[1]
	for i = 1, count do
		local current, following = range[i], range[i + 1]
		-- A run ends when the next value isn't the direct successor (which
		-- includes reaching the end of the list, where `following` is nil).
		if following ~= current + 1 then
			if current == run_start then
				parts[#parts + 1] = char(current)
			else
				parts[#parts + 1] = char(run_start) .. "-" .. char(current)
			end
			run_start = following
		end
	end
	return "[" .. concat(parts) .. "]"
end
-- Scans a character set in `pattern`, starting from byte offset `pos` (inside
-- the brackets), and returns the offset just after the closing "]". Returns
-- false if the set is unterminated, contains a multibyte character, or
-- contains a "%" escape followed by a character-class letter or a non-ASCII
-- byte.
local function parse_1_byte_charset(pattern, pos)
	repeat
		-- Jump to the next "%", "]" or multibyte character.
		local found, token, after = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos)
		if token == "%" then
			if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", after) then
				return false
			end
			-- Step over the "%" and the character it escapes.
			pos = found + 2
		elseif token == "]" then
			return after
		else
			-- Either nothing was found (no closing bracket) or the token is
			-- a multibyte character.
			return false
		end
	until false
end
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]
pattern_simplifier = require("Module:fun").memoize(function(pattern)
-- NOTE(review): the converter is wrapped with `memoize` from [[Module:fun]],
-- presumably so each distinct pattern is only analysed once - confirm against
-- that module.
-- A numeric pattern is simply returned in its string form.
if type(pattern) == "number" then
return tostring(pattern)
end
-- pos: current scan offset (in bytes); captures: number of "(" seen so far;
-- start: first byte of `pattern` not yet copied into `output`; n: number of
-- chunks in `output`; output: list of rewritten chunks, only created once a
-- rewrite is actually needed.
local pos, captures, start, n, output = 1, 0, 1, 0
while true do
local ch, nxt_pos
-- Jump to the next token that needs attention: "%", "(", ".", "[" or a
-- multibyte character (lead byte plus continuation bytes).
pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos)
if not ch then
break
end
-- The single byte which follows the token.
local nxt = sub(pattern, nxt_pos, nxt_pos)
if ch == "%" then
if nxt == "b" then
-- %bxy: both delimiters must be ASCII.
if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then
return false
end
pos = pos + 4
elseif nxt == "f" then
pos = pos + 2
-- %f must be followed by a charset which is not negated.
if not match(pattern, "^()%[[^^]", pos) then
return false
end
-- Only possible to convert a %f charset which is all
-- ASCII, so use parse_1_byte_charset.
pos = parse_1_byte_charset(pattern, pos)
if not pos then
return false
end
elseif nxt == "Z" then
pos = pos + 2
nxt = sub(pattern, pos, pos)
-- With a "*", "+" or "-" quantifier, %Z is left as it is.
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 1
else
-- Otherwise, replace %Z with a byte-level expression: one
-- non-NUL lead byte followed by any continuation bytes. `pos`
-- has already moved past "%Z", so `pos - 3` is the last byte
-- before it.
output = output or {}
n = n + 1
if nxt == "?" then
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*"
pos = pos + 1
else
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*"
end
start = pos
end
-- All other character classes can't be converted.
elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then
return false
-- Skip the next character if it's ASCII. Otherwise, we will
-- still need to do length checks.
else
pos = pos + (byte(nxt) < 128 and 2 or 1)
end
elseif ch == "(" then
-- Fail on position captures "()" and once 32 captures have been seen.
-- NOTE(review): presumably because position captures would give byte
-- rather than character offsets, and 32 is the string library's
-- capture limit - confirm.
if nxt == ")" or captures == 32 then
return false
end
captures = captures + 1
pos = pos + 1
elseif ch == "." then
-- With a "*", "+" or "-" quantifier, "." is left as it is.
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 2
else
-- Otherwise, replace "." with a byte-level expression for one whole
-- character: a non-continuation byte plus any continuation bytes.
output = output or {}
n = n + 1
if nxt == "?" then
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*"
pos = pos + 2
else
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*"
pos = pos + 1
end
start = pos
end
elseif ch == "[" then
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
if nxt == "^" then
return false
-- If the first character is "%", ch_len is determined by the
-- next one instead.
elseif nxt == "%" then
nxt_pos = nxt_pos + 1
nxt = sub(pattern, nxt_pos, nxt_pos)
end
-- Byte length of the first character in the set; every other character
-- in the set must have the same length.
local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos)
if ch_len == 1 then -- Single-byte charset.
pos = parse_1_byte_charset(pattern, pos + 1)
if not pos then
return false
end
else -- Multibyte charset.
-- `bytes` becomes a nested table keyed by successive byte values of
-- each character in the set, with `true` at the final byte.
local charset_pos, bytes = pos
pos = pos + 1
while true do -- TODO: non-ASCII charset ranges.
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
if not ch then
return false
-- If escaped, get the next character. No need to
-- distinguish magic characters or character classes,
-- as they'll all fail for having the wrong length
-- anyway.
elseif ch == "%" then
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
elseif ch == "]" then
pos = nxt_pos
break
end
if ch_len ~= #ch then
return false
end
bytes = bytes or {}
-- Shadow `bytes` so we can walk down the nested table without
-- losing the reference to its root.
local bytes = bytes
for i = 1, ch_len - 1 do
local b = byte(ch, i, i)
bytes[b] = bytes[b] or {}
bytes = bytes[b]
end
bytes[byte(ch, -1)] = true
pos = nxt_pos
end
if not pos then
return false
end
-- The quantifier (if any) after the closing bracket. Only "+" on sets of
-- 2-byte characters is supported, and the nested byte table has to pass
-- check_sets.
local nxt = sub(pattern, pos, pos)
if (
(nxt == "?" or nxt == "*" or nxt == "-") or
(nxt == "+" and ch_len > 2) or
not check_sets(bytes)
) then
return false
end
-- Flatten the nested table into one list of byte values per byte
-- position, following the first entry at each level.
local ranges, b, key, next_byte = {}, 0
repeat
key, next_byte = next(bytes)
local range, n = {key}, 1
-- Loop starts on the second iteration.
for key in next, bytes, key do
n = n + 1
range[n] = key
end
b = b + 1
ranges[b] = range
bytes = next_byte
until next_byte == true
if nxt == "+" then
-- Emit: a lead byte, then any number of bytes from either position,
-- then a final trailing byte.
local range1, range2 = ranges[1], ranges[2]
ranges[1] = make_charset(range1)
ranges[3] = make_charset(range2)
local n = #range2
for i = 1, #range1 do
n = n + 1
range2[n] = range1[i]
end
ranges[2] = make_charset(range2) .. "*"
pos = pos + 1
else
for i = 1, #ranges do
ranges[i] = make_charset(ranges[i])
end
end
output = output or {}
n = n + 1
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)
start = pos
end
-- The remaining branches deal with a literal multibyte character.
elseif nxt == "+" then
-- Only 2-byte characters can be converted when followed by "+".
if #ch ~= 2 then
return false
end
-- Copy everything up to and including the lead byte, then allow any
-- number of either byte, then require the trailing byte.
output = output or {}
n = n + 1
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2)
pos = nxt_pos + 1
start = pos
elseif nxt == "?" or nxt == "*" or nxt == "-" then
return false
else
pos = nxt_pos
end
end
-- If nothing was rewritten, the pattern can be used unchanged.
if start == 1 then
return pattern
end
-- Otherwise, join the rewritten chunks and append the uncopied remainder.
return concat(output) .. sub(pattern, start)
end, true)
export.pattern_simplifier = pattern_simplifier -- For testing.
end
--[==[Returns the length of `str` in UTF-8 characters, found by subtracting the number of continuation bytes (0x80-0xBF) from the byte length. If `str` is a number, the result of string.len is returned.]==]
function export.len(str)
	if type(str) == "number" then
		return len(str)
	end
	-- Deleting every run of non-continuation bytes leaves only the
	-- continuation bytes behind.
	local continuation_bytes = gsub(str, "[^\128-\191]+", "")
	return #str - #continuation_bytes
end
ulen = export.len
--[==[Returns the substring of `str` from character `i` (default 1) to character `j` (default: the end of the string), where the indices count UTF-8 characters instead of bytes. If either index is negative, the call is handed over to mw.ustring.sub. A numeric `str` is converted to a string first.]==]
function export.sub(str, i, j)
	if type(str) == "number" then
		str = tostring(str)
	end
	i = i or 1
	if i < 0 or (j and j < 0) then
		return usub(str, i, j)
	end
	if (j and i > j) or i > #str then
		return ""
	end
	-- Each match is a run of non-continuation bytes (i.e. one byte per
	-- character start) followed by the continuation bytes of the run's last
	-- character. `run_start` and `run_end` are the byte offsets of the start
	-- of the run and of the first byte after it.
	local chars_seen, first_byte = 0, nil
	for run_start, run_end in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do
		chars_seen = chars_seen + (run_end - run_start)
		if first_byte == nil and chars_seen >= i then
			-- Character `i` starts inside this run; count back from its end.
			first_byte = run_end - (chars_seen - i) - 1
			if not j then
				return sub(str, first_byte)
			end
		end
		if j and chars_seen > j then
			-- Character `j + 1` starts inside this run; stop on the byte
			-- just before it.
			return sub(str, first_byte, run_end - (chars_seen - j) - 1)
		end
	end
	if first_byte then
		return sub(str, first_byte)
	end
	return ""
end
do
-- Converts the byte offsets returned by string.find into the character offsets
-- that mw.ustring.find would return. `loc1` and `loc2` are string.find's first
-- two return values, and any captures in `...` are passed through unchanged.
-- Nothing is converted if there was no match or if `str` is pure ASCII.
local function _find(str, loc1, loc2, ...)
	if loc1 and not match(str, "^()[^\128-\255]*$") then
		-- Number of characters in the matched text (0 for an empty match).
		local match_len = ulen(sub(str, loc1, loc2))
		if loc1 > #str then
			-- Empty match at the very end of the string: there is no byte at
			-- `loc1` to count, so the character offset is one past the final
			-- character. (Counting characters in sub(str, 1, loc1) here would
			-- give a result which is one too small.)
			loc1 = ulen(str) + 1
		else
			-- Count the characters up to and including the one at `loc1`.
			loc1 = ulen(sub(str, 1, loc1))
		end
		-- Offset length with loc1 to get loc2.
		loc2 = loc1 + match_len - 1
	end
	return loc1, loc2, ...
end
--[==[Drop-in replacement for mw.ustring.find which does the search with string.find whenever it can: for plain searches, or when `pattern` can be converted into a string library pattern. It falls back to mw.ustring.find when the pattern can't be converted, or when `init` is not 1 and `str` contains non-ASCII bytes.]==]
function export.find(str, pattern, init, plain)
	init = init or 1
	-- A byte offset and a character offset only agree at position 1, or
	-- anywhere in a pure ASCII string.
	local offsets_agree = init == 1 or match(str, "^()[^\128-\255]*$")
	if not offsets_agree then
		return ufind(str, pattern, init, plain)
	end
	if plain then
		return _find(str, find(str, pattern, init, true))
	end
	local simple = pattern_simplifier(pattern)
	if not simple then
		return ufind(str, pattern, init)
	end
	return _find(str, find(str, simple, init))
end
end
--[==[Drop-in replacement for mw.ustring.match which uses string.match whenever `pattern` can be converted into a string library pattern. It falls back to mw.ustring.match when the pattern can't be converted, or when `init` is not 1 and `str` contains non-ASCII bytes.]==]
function export.match(str, pattern, init)
	init = init or 1
	-- A byte offset and a character offset only agree at position 1, or
	-- anywhere in a pure ASCII string.
	local offsets_agree = init == 1 or match(str, "^()[^\128-\255]*$")
	if offsets_agree then
		local simple = pattern_simplifier(pattern)
		if simple then
			return match(str, simple, init)
		end
	end
	return umatch(str, pattern, init)
end
--[==[Drop-in replacement for mw.ustring.gmatch which uses string.gmatch whenever `pattern` can be converted into a string library pattern, and mw.ustring.gmatch otherwise.]==]
function export.gmatch(str, pattern)
	local simple = pattern_simplifier(pattern)
	if not simple then
		return ugmatch(str, pattern)
	end
	return gmatch(str, simple)
end
--[==[Drop-in replacement for mw.ustring.gsub which uses string.gsub whenever `pattern` can be converted into a string library pattern, and mw.ustring.gsub otherwise.]==]
function export.gsub(str, pattern, repl, n)
	local simple = pattern_simplifier(pattern)
	if not simple then
		return ugsub(str, pattern, repl, n)
	end
	return gsub(str, simple, repl, n)
end
--[==[Literal version of gsub: `pattern` is escaped so that it matches itself exactly, and `repl` is escaped too if it is a string (a table or function `repl` is passed through untouched). Returns the same values as string.gsub.]==]
function export.plain_gsub(str, pattern, repl, n)
	if type(repl) == "string" then
		repl = replacement_escape(repl)
	end
	return gsub(str, pattern_escape(pattern), repl, n)
end
--[==[Reverses a UTF-8 string character by character; the UTF-8-aware counterpart of string.reverse.]==]
function export.reverse(str)
	-- First reverse the bytes of each multibyte character in place, so that
	-- reversing the whole string afterwards puts them back in the right order.
	local pre_flipped = gsub(str, "[\194-\244][\128-\191]*", reverse)
	return reverse(pre_flipped)
end
do
-- Raises the out-of-range error. `cp` is the already-formatted codepoint text
-- which gets inserted into the message.
local function err(cp)
	error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2)
end
-- Returns the UTF-8 encoding of the codepoint `cp` (a number, or anything
-- which tonumber accepts). Surrogates (U+D800 to U+DFFF) give "?". Raises an
-- error if `cp` is negative or greater than 0x10FFFF.
local function utf8_char(cp)
	cp = tonumber(cp)
	if cp < 0 then
		-- Report the magnitude in hex after an explicit minus sign, so that
		-- e.g. -1 is shown as "-0x1".
		err("-0x" .. format("%X", -cp))
	elseif cp < 0x80 then
		return char(cp)
	-- NOTE(review): the divisions below can give fractional values, so this
	-- relies on string.char discarding the fractional part (as it does in the
	-- Lua 5.1 used by Scribunto); Lua 5.3+ would need an explicit floor.
	elseif cp < 0x800 then
		return char(
			0xC0 + cp / 0x40,
			0x80 + cp % 0x40
		)
	elseif cp < 0x10000 then
		if cp >= 0xD800 and cp < 0xE000 then
			return "?" -- mw.ustring.char returns "?" for surrogates.
		end
		return char(
			0xE0 + cp / 0x1000,
			0x80 + cp / 0x40 % 0x40,
			0x80 + cp % 0x40
		)
	elseif cp < 0x110000 then
		return char(
			0xF0 + cp / 0x40000,
			0x80 + cp / 0x1000 % 0x40,
			0x80 + cp / 0x40 % 0x40,
			0x80 + cp % 0x40
		)
	end
	err("0x" .. format("%X", cp))
end
--[==[Returns the UTF-8 string made up of the given codepoints, in order. Each argument is converted with the same rules as a single codepoint: surrogates give "?", and values outside the range 0x0 to 0x10FFFF raise an error.]==]
function export.char(cp, ...)
	-- Fast path, taken when the first extra argument is missing or nil.
	if ... == nil then
		return utf8_char(cp)
	end
	local pieces = {cp, ...}
	-- Count the arguments explicitly instead of relying on #pieces.
	local count = 1 + select("#", ...)
	for idx = 1, count do
		pieces[idx] = utf8_char(pieces[idx])
	end
	return concat(pieces)
end
u = export.char
end
do
-- Decodes one UTF-8 character from its bytes. `b1` is the lead byte, and `b2`
-- to `b4` are the bytes after it (only those needed by the lead byte are
-- used). Returns the codepoint, followed by the number of bytes in the
-- character. The constants being subtracted are the combined marker bits of
-- the lead and continuation bytes.
local function get_codepoint(b1, b2, b3, b4)
	if b1 < 128 then
		return b1, 1
	elseif b1 < 224 then
		return 0x40 * b1 + b2 - 0x3080, 2
	elseif b1 < 240 then
		return 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080, 3
	end
	return 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080, 4
end
--[==[Returns the codepoints of characters `i` to `j` of `str`, where the indices count UTF-8 characters. `i` defaults to 1 and `j` defaults to `i`; a `j` of -1 means the end of the string. Nothing is returned for positions past the end of the string. If either index is otherwise negative, the call is handed over to mw.ustring.codepoint. If `str` is a number, the result of string.byte is returned.]==]
function export.codepoint(str, i, j)
	if type(str) == "number" then
		return byte(str, i, j)
	end
	-- The right-hand side is evaluated before either assignment, so the `i`
	-- in it is the caller's original value (possibly nil). An explicit `j` is
	-- kept, and only falls back to `i` (then 1) when it was omitted.
	i, j = i or 1, j == -1 and #str or j or i or 1
	if i == 1 and j == 1 then
		-- Fast path for the first character.
		local b1, b2, b3, b4 = byte(str, 1, 4)
		if not b1 then
			-- Empty string: return nothing, as the general path does for
			-- positions past the end.
			return
		end
		return (get_codepoint(b1, b2, b3, b4))
	elseif i < 0 or j < 0 then
		return ucodepoint(str, i, j) -- FIXME
	end
	-- n: index of the current character; nb: byte offset of the current
	-- character; ret: collected codepoints; nr: number of items in ret.
	local n, nb, ret, nr = 0, 1, {}, 0
	while n < j do
		n = n + 1
		local b1, b2, b3, b4 = byte(str, nb, nb + 3)
		if not b1 then
			-- Ran off the end of the string.
			break
		end
		if n < i then
			-- Before the requested range: just step over the character.
			nb = nb + (b1 < 128 and 1 or b1 < 224 and 2 or b1 < 240 and 3 or 4)
		else
			nr = nr + 1
			local add
			ret[nr], add = get_codepoint(b1, b2, b3, b4)
			nb = nb + add
		end
	end
	return unpack(ret)
end
codepoint = export.codepoint
--[==[Returns an iterator which yields the codepoints of characters `i` to `j` of `str` one at a time, where the indices count UTF-8 characters. `i` defaults to 1; if `j` is omitted or -1, iteration runs to the end of the string. If either index is otherwise negative, the call is handed over to mw.ustring.gcodepoint.]==]
function export.gcodepoint(str, i, j)
	i = i or 1
	-- -1 (or any false value) means there is no upper limit.
	if j == -1 or not j then
		j = nil
	end
	if i < 0 or (j and j < 0) then
		return ugcodepoint(str, i, j) -- FIXME
	end
	-- char_idx: index of the next character to be yielded; byte_idx: byte
	-- offset at which it starts.
	local char_idx, byte_idx = 1, 1
	-- Step over the characters before `i`, by looking at their lead bytes.
	while char_idx < i do
		local lead = byte(str, byte_idx)
		if not lead then
			break
		end
		local width = 4
		if lead < 128 then
			width = 1
		elseif lead < 224 then
			width = 2
		elseif lead < 240 then
			width = 3
		end
		byte_idx = byte_idx + width
		char_idx = char_idx + 1
	end
	return function()
		if j and char_idx > j then
			return nil
		end
		char_idx = char_idx + 1
		local b1, b2, b3, b4 = byte(str, byte_idx, byte_idx + 3)
		if not b1 then
			-- End of the string.
			return nil
		end
		local cp, width = get_codepoint(b1, b2, b3, b4)
		byte_idx = byte_idx + width
		return cp
	end
end
end
--[==[Converts `str` to lowercase, using string.lower if `str` is pure ASCII and mw.ustring.lower otherwise.]==]
function export.lower(str)
	-- The position capture means a match on the empty string is still truthy.
	if match(str, "^()[^\128-\255]*$") then
		return lower(str)
	end
	return ulower(str)
end
--[==[Converts `str` to uppercase, using string.upper if `str` is pure ASCII and mw.ustring.upper otherwise.]==]
function export.upper(str)
	-- The position capture means a match on the empty string is still truthy.
	if match(str, "^()[^\128-\255]*$") then
		return upper(str)
	end
	return uupper(str)
end
do
-- Appends the captures in `...` to the list `text`, which currently holds `n`
-- items, stopping at the first capture which is nil or false. Returns the new
-- item count.
local function add_captures(text, n, ...)
	local capture = ...
	local idx = 1
	while capture do
		text[n + idx] = capture
		idx = idx + 1
		-- Assigning to a single variable keeps just argument number `idx`.
		capture = select(idx, ...)
	end
	return n + idx - 1
end
-- Performs one step of a split. `loc1`, `loc2` and `...` are the results of
-- the most recent find call (match start, match end, captures). Appends the
-- next chunk (and any captures) to `text`, then returns the new item count
-- followed by the offset to search from next. No next offset is returned once
-- splitting is finished.
local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...)
	n = n + 1
	if not loc1 or start > str_len then
		-- If no match, or there is but we're past the end of the string
		-- (which happens when the match is the empty string), then add the
		-- final chunk and return.
		text[n] = _sub(str, start)
		return
	end
	if loc2 >= loc1 then
		-- Non-empty match: add the chunk up to it, then resume after it.
		text[n] = _sub(str, start, loc1 - 1)
		start = loc2 + 1
	else
		-- Empty match: include the next character in the chunk. This avoids
		-- an infinite loop, and makes splitting by an empty string work the
		-- way mw.text.split() does (including non-adjacent empty string
		-- matches with %f). If using the string library, make sure we advance
		-- by one whole UTF-8 character.
		if _sub == sub then
			loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
		end
		text[n] = _sub(str, start, loc1)
		start = loc1 + 1
		if start > str_len then
			-- The end of the string was reached this way, so return without
			-- a next offset; that way, we don't get a final empty string.
			if ... then
				return add_captures(text, n, ...)
			end
			return n
		end
	end
	-- Insert any captures from the splitting pattern.
	if ... then
		return add_captures(text, n, ...), start
	end
	return n, start
end
-- Drives the split: keeps searching with `_find` from the current offset and
-- passes each result to `iterate`, until `iterate` stops returning a next
-- offset. Returns the list of collected chunks and captures.
local function _split(str, pattern, str_len, _sub, _find, plain)
	local text, n, start = {}, 0, 1
	while start do
		n, start = iterate(str, str_len, text, n, start, _sub, _find(str, pattern, start, plain))
	end
	return text
end
--[==[Splits `str` at every match of `pattern` and returns the pieces as a list; a reimplementation of mw.text.split() which also includes any captures from the splitting pattern in the output, like Python's re.split(). Unlike Python, an empty pattern gives Lua's behaviour (advancing one character at a time), instead of returning the whole remainder of the string. The string library is used when possible, and the ustring library otherwise. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns off pattern matching, so that `pattern` is treated as literal.]==]
function export.split(str, pattern, str_lib, plain)
	if str_lib or plain then
		return _split(str, pattern, #str, sub, find, plain)
	end
	local simple = pattern_simplifier(pattern)
	if not simple then
		-- The pattern can't be converted, so work in characters throughout.
		return _split(str, pattern, ulen(str), usub, ufind)
	end
	return _split(str, simple, #str, sub, find)
end
export.capturing_split = export.split -- To be removed.
end
do
-- TODO: merge this with export.split. Not clear how to do this while
-- maintaining the same level of performance, as gsplit is slower.
-- Returns an iterator which gives the next chunk of `str` on each call
-- (followed by any captures from the splitting pattern), and nil once there
-- is nothing left.
local function _split(str, pattern, str_len, _sub, _find, plain)
	-- start: offset to search from next; final: set once the last chunk has
	-- been handed out.
	local start, final = 1
	local function iter(loc1, loc2, ...)
		-- If no match, return the final chunk.
		if not loc1 then
			final = true
			return _sub(str, start)
		end
		if loc2 >= loc1 then
			-- Non-empty match: eat the chunk up to it, then resume after it.
			local chunk = _sub(str, start, loc1 - 1)
			start = loc2 + 1
			return chunk, ...
		end
		-- Empty match: eat the next character as well. This avoids an
		-- infinite loop, and makes splitting by the empty string work the way
		-- mw.text.gsplit() does (including non-adjacent empty string matches
		-- with %f). If using the string library, make sure we advance by one
		-- whole UTF-8 character.
		if _sub == sub then
			loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
		end
		local chunk = _sub(str, start, loc1)
		if loc1 >= str_len then
			-- The end of the string was reached this way; stop here, so we
			-- don't get stuck matching the empty string at the end.
			final = true
		else
			start = loc1 + 1
		end
		return chunk, ...
	end
	return function()
		if final then
			return nil
		end
		return iter(_find(str, pattern, start, plain))
	end
end
--[==[Iterator version of split: returns a function which gives the next piece of `str` on each call (followed by any captures from the splitting pattern), and nil once the string is used up. The string library is used when possible, and the ustring library otherwise. `str_lib` forces use of the string library, while `plain` turns off pattern matching, so that `pattern` is treated as literal.]==]
function export.gsplit(str, pattern, str_lib, plain)
	if str_lib or plain then
		return _split(str, pattern, #str, sub, find, plain)
	end
	local simple = pattern_simplifier(pattern)
	if not simple then
		-- The pattern can't be converted, so work in characters throughout.
		return _split(str, pattern, ulen(str), usub, ufind)
	end
	return _split(str, simple, #str, sub, find)
end
end
--[==[Trims `str` at both ends. Without `charset`, characters matching `%s` are removed. Otherwise `charset` is inserted into a character class (`[...]`) and the characters it matches are removed; the string library is used when `charset` contains no bytes above 127, and the ustring library otherwise.]==]
function export.trim(str, charset)
	if not charset then
		-- A string consisting only of whitespace trims to nothing.
		if match(str, "^()%s*$") then
			return ""
		end
		return match(str, "^%s*(.*%S)")
	end
	if not match(charset, "^()[^\128-\255]*$") then
		-- Non-ASCII bytes in the set: character-aware matching is required.
		return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$")
	end
	if match(str, "^()[" .. charset .. "]*$") then
		return ""
	end
	return match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])")
end
do
	-- Named-entity table, loaded on first use.
	local entities

	-- Decodes the digits of a numeric character reference. `pattern` validates
	-- `code` and `base` is the radix it is parsed in. Returns the character,
	-- or nil when the reference is invalid (in which case gsub leaves the
	-- original text untouched).
	local function decode_numeric_entity(code, pattern, base)
		local cp = match(code, pattern) and tonumber(code, base)
		-- Reject values above U+10FFFF, and the surrogate range
		-- U+D800-U+DFFF, which cannot be encoded in valid UTF-8.
		return cp and (cp < 0xD800 or cp > 0xDFFF and cp < 0x110000) and u(cp) or nil
	end

	-- gsub callback. `hash` is "#" for numeric references, `x` is the optional
	-- "x"/"X" hexadecimal marker, and `code` is the remaining text.
	local function decode_entity(hash, x, code)
		if hash == "#" then
			-- Select the decoder strictly by the presence of the "x" marker.
			-- An `a and b or c` chain must not be used here: a failed decimal
			-- decode returns nil, which would fall through to the hexadecimal
			-- decoder and wrongly turn e.g. "&#ff;" into U+00FF.
			if x == "" then
				return decode_numeric_entity(code, "^%d+$", 10)
			end
			return decode_numeric_entity(code, "^%x+$", 16)
		end
		entities = entities or load_data("Module:data/entities")
		-- For named entities, `x` is simply the first letter of the name.
		return entities[x .. code]
	end

	--[==[Replaces HTML character references in `str` (named, decimal and hexadecimal) with the characters they denote. Unrecognized or invalid references are left unchanged.]==]
	-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
	function export.decode_entities(str)
		-- Skip the gsub entirely when there is no ampersand.
		return find(str, "&", 1, true) and
			gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
	end
end
do
	-- Memo table mapping a character to its entity; created on first use and
	-- extended by encode_entity.
	local html_entities

	-- gsub callback: returns the entity for `ch`, generating (and memoizing)
	-- a decimal numeric reference when no entry exists yet.
	local function encode_entity(ch)
		local entity = html_entities[ch]
		if entity then
			return entity
		end
		entity = "&#" .. codepoint(ch) .. ";"
		html_entities[ch] = entity
		return entity
	end

	--[==[Replaces characters in `str` with HTML entities.
	* With no `charset`, the characters {{code|lua|"\""}}, {{code|lua|"&"}}, {{code|lua|"'"}}, {{code|lua|"<"}}, {{code|lua|">"}} and the no-break space (U+00A0) are replaced by their entities.
	* Otherwise, `charset` is inserted into a character class (`[...]`) and every match is replaced, with a decimal numeric reference being used for characters that have no entry in the table.
	* `plain` passes `charset` through `charset_escape` first.
	* `str_lib` forces use of the string library, and throws an error if `charset` contains any byte above 127.]==]
	function export.encode_entities(str, charset, str_lib, plain)
		-- Memoized HTML entities (taken from mw.text.lua).
		html_entities = html_entities or {
			["\""] = "&quot;",
			["&"] = "&amp;",
			["'"] = "&#039;",
			["<"] = "&lt;",
			[">"] = "&gt;",
			["\194\160"] = "&nbsp;",
		}
		if not charset then
			-- "\194\160" is the UTF-8 encoding of U+00A0. A lone "\194" that
			-- starts some other character has no table entry, so gsub leaves
			-- it as it is.
			return (gsub(str, "[\"&'<>\194]\160?", html_entities))
		elseif plain then
			return (gsub(str, "[" .. charset_escape(charset) .. "]", encode_entity))
		elseif str_lib then
			if not match(charset, "^()[^\128-\255]*$") then
				error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.")
			end
			return (gsub(str, "[" .. charset .. "]", encode_entity))
		end
		local pattern = "[" .. charset .. "]"
		-- Prefer the string library when the pattern can be simplified.
		local simple = pattern_simplifier(pattern)
		if simple then
			return (gsub(str, simple, encode_entity))
		end
		return (ugsub(str, pattern, encode_entity))
	end
end
do
	-- Converts a pair of hexadecimal digits into the byte they encode.
	local function hex_to_byte(hex)
		return char(tonumber(hex, 16))
	end

	-- gsub callback for the QUERY and WIKI modes. `lead` is the matched
	-- special character and `trail` holds up to two hex digits following it.
	local function unescape(lead, trail)
		if lead ~= "+" and lead ~= "_" then
			-- Only a complete two-digit escape is decoded; anything shorter
			-- is put back as it was.
			if #trail == 2 then
				return hex_to_byte(trail)
			end
			return lead .. trail
		end
		-- "+" / "_" stand for a space; the digits after them are ordinary text.
		return " " .. trail
	end

	--[==[Decodes a percent-encoded string. `enctype` (case-insensitive, default {{code|lua|"QUERY"}}) selects the mode: {{code|lua|"PATH"}} decodes only percent escapes, {{code|lua|"QUERY"}} additionally turns {{code|lua|"+"}} into a space, and {{code|lua|"WIKI"}} additionally turns {{code|lua|"_"}} into a space. Any other value throws an error.]==]
	function export.decode_uri(str, enctype)
		enctype = enctype and upper(enctype) or "QUERY"
		if enctype == "PATH" then
			if find(str, "%", 1, true) then
				return (gsub(str, "%%(%x%x)", hex_to_byte))
			end
			return str
		end
		local space_char, pattern
		if enctype == "QUERY" then
			space_char, pattern = "+", "([%%%+])(%x?%x?)"
		elseif enctype == "WIKI" then
			space_char, pattern = "_", "([%%_])(%x?%x?)"
		else
			error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
		end
		-- Only run the substitution if there is something to decode.
		if find(str, "%", 1, true) or find(str, space_char, 1, true) then
			return (gsub(str, pattern, unescape))
		end
		return str
	end
end
do
	-- Performs a single left-to-right pass over `text`, dropping every
	-- "<!-- ... -->" pair. If `drop_unclosed` is set, an unclosed "<!--" and
	-- everything after it is dropped as well; otherwise it is kept verbatim.
	local function strip_comments(text, drop_unclosed)
		local open = find(text, "<!--", 1, true)
		if not open then
			return text
		end
		local pieces, count = {sub(text, 1, open - 1)}, 1
		repeat
			-- The terminator is searched for after the 4-byte opener.
			local close = find(text, "-->", open + 4, true)
			if not close then
				if drop_unclosed then
					return concat(pieces)
				end
				return concat(pieces) .. sub(text, open)
			end
			local resume = close + 3
			open = find(text, "<!--", resume, true)
			if not open then
				-- No more comments: keep the rest of the text.
				return concat(pieces) .. sub(text, resume)
			end
			count = count + 1
			pieces[count] = sub(text, resume, open - 1)
		until false
	end

	--[==[Strips HTML comments from the input text. `stage` selects one of three methods:
	* {{lua|"PRE"}} (default) follows the method of MediaWiki's preprocessor: every {{code||<nowiki><!-- ... --></nowiki>}} pair is removed, together with any text following an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
	* {{lua|"POST"}} follows the method used when generating the final page output after all templates have been expanded: {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed repeatedly until none remain (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), while an unclosed {{code||<nowiki><!--</nowiki>}} is left alone. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will have already been applied by the native parser.
	* {{lua|"BOTH"}} applies {{lua|"PRE"}} followed by {{lua|"POST"}}.
	Any other value of `stage` throws an error.]==]
	function export.remove_comments(str, stage)
		local current
		if not stage or stage == "PRE" then
			return strip_comments(str, true)
		elseif stage == "POST" then
			current = strip_comments(str)
		elseif stage == "BOTH" then
			current = strip_comments(str, true)
		else
			error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2)
		end
		-- Repeat until a pass no longer changes anything.
		while current ~= str do
			str = current
			current = strip_comments(str)
		end
		return str
	end
end
--[==[Lua counterpart of PHP's {{code|php|trim($string)}}: strips {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}} from both ends of `str`. Useful for template parameters, which the native parser trims in the same way. Returns the empty string if nothing remains.]==]
function export.php_trim(str)
	-- The two frontier patterns mark the first and the last boundary between
	-- a trimmable character and any other character.
	local trimmed = match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]")
	if trimmed then
		return trimmed
	end
	return ""
end
php_trim = export.php_trim
--[==[Returns the Scribunto-normalized form of a parameter name (i.e. the key which that parameter would have in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} becomes {{code|lua|1}} (a number), and {{code|lua|" foo "}} becomes {{code|lua|"foo"}}. Inputs which are not strings are returned unchanged.
Strings are first trimmed with {{code|lua|export.php_trim}}, and are then converted to numbers if:
# They are integers, with no decimals (2.0) or leading zeroes (02).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# For positive values, they do not have a leading {{code|lua|+}} sign.]==]
function export.scribunto_param_key(key)
	if type(key) ~= "string" then
		return key
	end
	key = php_trim(key)
	if key == "0" then
		return 0
	elseif not match(key, "^-?[1-9]%d*$") then
		return key
	end
	local num = tonumber(key)
	if num <= 9007199254740991 and num >= -9007199254740991 then
		return num
	elseif key == "9007199254740992" or key == "-9007199254740992" then
		-- Lua integers are only accurate to 2^53 - 1, so 2^53 and -2^53 have
		-- to be recognized by their text, since 2^53 == 2^53 + 1 evaluates
		-- to true.
		return num
	end
	return key
end
do
	-- Lookup table of byte escapes, taken from the data module on first use.
	local byte_escapes

	-- gsub callback: returns the table entry for the byte, or else a
	-- backslash followed by the byte value as three decimal digits.
	local function escape_byte(b)
		local escaped = byte_escapes[b]
		if escaped then
			return escaped
		end
		return format("\\%03d", byte(b))
	end

	--[==[Returns `str` with every byte replaced by an escape: the entry from the `byte_escapes` table of [[Module:string utilities/data]] if there is one, and otherwise a backslash followed by the three-digit decimal value of the byte.]==]
	function export.escape_bytes(str)
		if not byte_escapes then
			byte_escapes = load_data("Module:string utilities/data").byte_escapes
		end
		return (gsub(str, ".", escape_byte))
	end
end
--[==[Replaces every {{code|lua|{name}}} placeholder in `str` with the string returned by {{code|lua|fun(name)}}. The escapes <code>{\op}</code> and <code>{\cl}</code> produce literal opening and closing braces; a name that begins with a backslash is written with that backslash doubled. Throws an error if an escape sequence is unrecognized, if `fun` returns nil or false for a name, or if it returns a value which is not a string. `fun` is called exactly once per placeholder.]==]
function export.format_fun(str, fun)
	return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2)
		-- Exactly one leading backslash marks an escape sequence (two mean
		-- the name itself starts with a backslash).
		if #p1 + #p2 == 1 then
			return name == "op" and "{" or
				name == "cl" and "}" or
				error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
		end
		-- Look the name up only once: the callback may be expensive or
		-- stateful, and repeated calls could give inconsistent results.
		local value = fun(name)
		if not value then
			error(module_name .. ".format: \"" .. name .. "\" not found in table")
		elseif type(value) ~= "string" then
			error(module_name .. ".format: \"" .. name .. "\" is a " .. type(value) .. ", not a string")
		end
		return value
	end))
end
format_fun = export.format_fun
--[==[Unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, this function takes just two parameters, a format string and a table, and replaces each instance of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. Literal opening and closing braces are written as <code>{\op}</code> and <code>{\cl}</code>, respectively. A table key which begins with a backslash is referenced by doubling that initial backslash.
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
	-- Adapter which turns the table into the lookup function expected by
	-- format_fun.
	local function lookup(key)
		return tbl[key]
	end
	return format_fun(str, lookup)
end
do
	-- Re-cases the first UTF-8 character of `str` with `case_func` and
	-- returns the result; the empty string is returned unchanged.
	local function do_uclcfirst(str, case_func)
		local first_letter = match(str, "^.[\128-\191]*")
		if not first_letter then
			return ""
		end
		-- The remainder must be located using the byte length of the ORIGINAL
		-- first letter: case mapping can change the encoded length (e.g.
		-- U+0131 "ı", 2 bytes, uppercases to "I", 1 byte), so using the length
		-- of the converted letter would corrupt the rest of the string.
		return case_func(first_letter) .. sub(str, #first_letter + 1)
	end

	local function uclcfirst(str, case_func)
		-- If there's a link at the beginning, re-case the first letter of the
		-- link text. This pattern matches both piped and unpiped links.
		-- If the link is not piped, the second capture (linktext) will be empty.
		local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
		if link then
			-- The result is always a piped link, so that the link target
			-- keeps its original casing.
			return "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
		end
		return do_uclcfirst(str, case_func)
	end

	--[==[Uppercases the first character of `str`. If `str` begins with a link, the first character of the link's display text is uppercased instead.]==]
	function export.ucfirst(str)
		return uclcfirst(str, uupper)
	end

	--[==[Lowercases the first character of `str`. If `str` begins with a link, the first character of the link's display text is lowercased instead.]==]
	function export.lcfirst(str)
		return uclcfirst(str, ulower)
	end

	-- gsub callback used by export.capitalize.
	local function capitalize(w)
		return uclcfirst(w, uupper)
	end

	--[==[Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.]==]
	function export.capitalize(str)
		if type(str) == "table" then
			-- allow calling from a template
			str = str.args[1]
		end
		-- Capitalize multi-word that is separated by spaces
		-- by uppercasing the first letter of each part.
		-- I assume nobody will input all CAP text.
		return (ugsub(str, "%S+", capitalize))
	end
end
do
	-- True if the word ends in -y preceded by something other than a vowel
	-- or a space.
	local function ends_in_consonant_y(word)
		-- FIXME, a subrule of rule #1 of export.pluralize says the -ies ending
		-- doesn't apply to proper nouns, hence "the Gettys", "the public Ivys".
		-- We should maybe consider applying this rule here; but it may not
		-- be important as this function is almost always called on common nouns
		-- (e.g. parts of speech, place types).
		return find(word, "[^aeiouAEIOU ]y$")
	end

	-- True if the word ends in -s, -x, -z, -ch or -sh.
	local function needs_es(word)
		return find(word, "[sxz]$") or find(word, "[cs]h$")
	end

	-- Pluralizes a bare word (no link handling).
	local function pluralize_word(word)
		if ends_in_consonant_y(word) then
			-- avoid returning multiple values
			return (gsub(word, "y$", "ies"))
		end
		return word .. (needs_es(word) and "es" or "s")
	end

	--[==[
	Pluralize a word in a smart fashion, according to normal English rules.
	# If the word ends in consonant + -y, the -y is replaced with -ies.
	# If the word ends in -s, -x, -z, -sh or -ch, -es is added.
	# Otherwise, -s is added.

	Links at the end of the input are handled as follows:
	# In a piped link, the display text is pluralized.
	# A non-piped link to which rule #1 applies becomes a piped link whose display text is the plural.
	# A non-piped link to which rule #2 or #3 applies gets the ending added after the link.
	]==]
	function export.pluralize(str)
		if type(str) == "table" then
			-- allow calling from a template
			str = str.args[1]
		end
		-- Look for a link at the end of the input. The pattern matches piped
		-- and unpiped links alike; for an unpiped link, the last capture
		-- (linktext) is empty.
		local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
		if not link then
			return pluralize_word(str)
		end
		local prefix = beginning .. "[[" .. link
		if linktext ~= "" then
			return prefix .. "|" .. pluralize_word(linktext) .. "]]"
		elseif ends_in_consonant_y(link) then
			return prefix .. "|" .. gsub(link, "y$", "ies") .. "]]"
		end
		return prefix .. "]]" .. (needs_es(link) and "es" or "s")
	end
end
do
	-- Patterns tried in order once the -ies rule has failed; the first one
	-- which matches supplies the singular.
	local plural_patterns = {
		-- Handle cases like "[[parish]]es"
		"^(.-[sc]h%]*)es$",
		-- Handle cases like "[[box]]es"
		"^(.-x%]*)es$",
		-- Handle regular plurals
		"^(.-)s$",
	}

	-- Singularizes a bare word (no link handling); returns the input
	-- unchanged if no rule applies.
	local function singularize_word(word)
		local stem = match(word, "^(.-)ies$")
		if stem then
			return stem .. "y"
		end
		for i = 1, #plural_patterns do
			stem = match(word, plural_patterns[i])
			if stem then
				return stem
			end
		end
		return word
	end

	--[==[
	Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.

	'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
	# If the word ends in -ies, -ies is replaced with -y.
	# If the word ends in -xes, -shes or -ches, -es is removed. [Does not affect -ses, cf. "houses", "impasses".]
	# Otherwise, -s is removed.

	Links at the end of the input are handled as follows:
	# In a piped link, the display text is singularized; the link is collapsed to a simple link if both parts end up the same.
	# In a non-piped link, the link itself is singularized.
	# A link like "[[parish]]es" is handled correctly, because the check for -shes etc. allows ] characters between the
	  'sh' etc. and the final -es.
	]==]
	function export.singularize(str)
		if type(str) == "table" then
			-- allow calling from a template
			str = str.args[1]
		end
		-- Look for a link at the end of the input. The pattern matches piped
		-- and unpiped links alike; for an unpiped link, the last capture
		-- (linktext) is empty.
		local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
		if not link then
			return singularize_word(str)
		elseif linktext == "" then
			return beginning .. "[[" .. singularize_word(link) .. "]]"
		end
		local singular = singularize_word(linktext)
		if singular == link then
			-- Display text now equals the target, so no pipe is needed.
			return beginning .. "[[" .. link .. "]]"
		end
		return beginning .. "[[" .. link .. "|" .. singular .. "]]"
	end
end
--[==[
Returns the indefinite article ("a" or "an") which should precede `str`, based on whether its first letter is one of
A, E, I, O, U (in either case). If `str` begins with a link, the first letter of the link's display text is examined.
Words like [[union]], [[uniform]] and [[university]], which take "a" despite beginning with a 'u', are not handled
correctly. The article is returned with a capital first letter if `ucfirst` is specified, and in lowercase otherwise.
A nil `str` is treated as the empty string.
]==]
function export.get_indefinite_article(str, ucfirst)
	str = str or ""
	-- This pattern matches both piped and unpiped links at the start of the
	-- text. If the link is not piped, the second capture (linktext) is empty.
	local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
	-- The text whose first letter decides the article.
	local visible = str
	if link then
		visible = linktext ~= "" and linktext or link
	end
	if find(visible, "^[AEIOUaeiou]") then
		return ucfirst and "An" or "an"
	end
	return ucfirst and "A" or "a"
end
get_indefinite_article = export.get_indefinite_article
--[==[
Returns `text` preceded by its indefinite article, as chosen by {get_indefinite_article()}, and a space. Links and
capitalized text are handled correctly. Words like [[union]], [[uniform]] and [[university]], which take "a" despite
beginning with a 'u', are not handled correctly. The article has a capital first letter if `ucfirst` is specified, and
is lowercase otherwise.
]==]
function export.add_indefinite_article(text, ucfirst)
	local article = get_indefinite_article(text, ucfirst)
	return article .. " " .. text
end
return export