မေႃႇၵျူး:string utilities: လွင်ႈပႅၵ်ႇပိူင်ႈ ၼႂ်းၵႄႈ လွင်ႈၶူၼ်ႉၶႆႈ
No edit summary Tag: Reverted |
Sai Myo Thura Kyaw (ဢုပ်ႇဢူဝ်း) ၵေႃႉ ၶိုၼ်ၶိုၼ်း လွင်ႈၶူၼ်ႉၶႆႈ 101075 Tags: Undo Reverted |
||
ထႅဝ် 1: | ထႅဝ် 1: | ||
local mw = mw |
|||
local string = string |
|||
local table = table |
|||
local ustring = mw.ustring |
|||
local byte = string.byte |
|||
local char = string.char |
|||
local concat = table.concat |
|||
local find = string.find |
|||
local format = string.format |
|||
local gmatch = string.gmatch |
|||
local gsub = string.gsub |
|||
local len = string.len |
|||
local load_data = mw.loadData |
|||
local lower = string.lower |
|||
local match = string.match |
|||
local next = next |
|||
local reverse = string.reverse |
|||
local select = select |
|||
local sort = table.sort |
|||
local sub = string.sub |
|||
local tonumber = tonumber |
|||
local tostring = tostring |
|||
local type = type |
|||
local ucodepoint = ustring.codepoint |
|||
local ufind = ustring.find |
|||
local ugcodepoint = ustring.gcodepoint |
|||
local ugmatch = ustring.gmatch |
|||
local ugsub = ustring.gsub |
|||
local ulower = ustring.lower |
|||
local umatch = ustring.match |
|||
local unpack = unpack |
|||
local upper = string.upper |
|||
local usub = ustring.sub |
|||
local uupper = ustring.upper |
|||
-- Defined below. |
|||
local charset_escape |
|||
local codepoint |
|||
local explode_utf8 |
|||
local format_fun |
|||
local get_indefinite_article |
|||
local pattern_escape |
|||
local pattern_simplifier |
|||
local php_trim |
|||
local replacement_escape |
|||
local u |
|||
local ulen |
|||
local module_name = "string_utilities" |
local module_name = "string_utilities" |
||
local export = {} |
local export = {} |
||
local format_escapes = { |
|||
--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==] |
|||
["op"] = "{", |
|||
function export.explode_utf8(str) |
|||
["cl"] = "}", |
|||
} |
|||
for ch in gmatch(str, ".[\128-\191]*") do |
|||
i = i + 1 |
|||
text[i] = ch |
|||
end |
|||
return text |
|||
end |
|||
explode_utf8 = export.explode_utf8 |
|||
function export.format_fun(str, fun) |
|||
--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==] |
|||
return (str:gsub("{(\\?)((\\?)[^{}]*)}", function (p1, name, p2) |
|||
function export.pattern_escape(str) |
|||
if #p1 + #p2 == 1 then |
|||
return (gsub(str, "[$%%()*+%-.?[%]^]", "%%%0")) |
|||
return format_escapes[name] or error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'") |
|||
else |
|||
if fun(name) and type(fun(name)) ~= "string" then |
|||
error(module_name .. ".format: '" .. name .. "' is a " .. type(fun(name)) .. ", not a string") |
|||
end |
|||
return fun(name) or error(module_name .. ".format: '" .. name .. "' not found in table") |
|||
end |
|||
end)) |
|||
end |
end |
||
pattern_escape = export.pattern_escape |
|||
--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a backslash can be escaped by doubling the initial backslash. |
|||
--[==[Escapes only the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>.]==] |
|||
====Examples==== |
|||
function export.charset_escape(str) |
|||
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}} |
|||
return (gsub(str, "[%%%-%]^]", "%%%0")) |
|||
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}} |
|||
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}} |
|||
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}} |
|||
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==] |
|||
function export.format(str, tbl) |
|||
return export.format_fun(str, function (key) return tbl[key] end) |
|||
end |
end |
||
charset_escape = export.charset_escape |
|||
--[==[Explodes a string into an array of UTF8 characters. '''Warning''': this function has no safety checks for non-UTF8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==] |
|||
--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==] |
|||
function export. |
function export.explode_utf8(str) |
||
local byte = string.byte |
|||
return (gsub(str, "%%", "%%%%")) |
|||
local sub = string.sub |
|||
end |
|||
replacement_escape = export.replacement_escape |
|||
local str_len = #str |
|||
local text = {} |
|||
do |
|||
local |
local n, i, b = 1, 0 |
||
local k2 |
|||
while n <= str_len do |
|||
for k1, v1 in next, set1 do |
|||
b = byte(str, n) |
|||
local v2 = set2[k1] |
|||
i = i + 1 |
|||
if v1 ~= v2 and (v2 == nil or not check_sets_equal(v1, v2)) then |
|||
if b < 0xC0 then |
|||
return false |
|||
text[i] = sub(str, n, n) |
|||
end |
|||
n = n + 1 |
|||
elseif b < 0xE0 then |
|||
text[i] = sub(str, n, n + 1) |
|||
n = n + 2 |
|||
elseif b < 0xF0 then |
|||
text[i] = sub(str, n, n + 2) |
|||
n = n + 3 |
|||
else |
|||
text[i] = sub(str, n, n + 3) |
|||
n = n + 4 |
|||
end |
end |
||
return next(set2, k2) == nil |
|||
end |
end |
||
return text |
|||
local function check_sets(bytes) |
|||
end |
|||
local key, set1, set = next(bytes) |
|||
if set1 == true then |
|||
-- A helper function which takes a string, position and type ("byte" or "char"), and returns the equivalent position for the other type (e.g. iterate_utf8("字典", 2, "char") returns 4, because character 2 of "字典" begins with byte 4). `pos` can be positive or negative, and the function will iterate over the string forwards or backwards (respectively) until it reaches the input position. It checks the string byte by byte, skipping over trailing bytes and, for each leading byte, working out how many trailing bytes should follow; those trailing bytes are then checked together. |
|||
return true |
|||
-- The optional parameters `init_from_type` and `init_to_type` can be used to start part-way through an iteration to improve performance, if multiple values need to be returned from the same string. For example, iterate_utf8("слова́рь", 11, "byte", 5, 3) will begin checking at byte 5/the start of character 3. Note: The function won't check if these values match each other (as the only way to do this would be to run the iteration from the beginning), so mismatched values will return incorrect results. |
|||
elseif not check_sets(set1) then |
|||
local function iterate_utf8(text, pos, from_type, init_from_type, init_to_type) |
|||
return false |
|||
-- Position 0 is always valid and never changes. |
|||
end |
|||
if pos == 0 then |
|||
while true do |
|||
return pos |
|||
key, set = next(bytes, key) |
|||
if not key then |
|||
return true |
|||
elseif not check_sets_equal(set, set1) then |
|||
return false |
|||
end |
|||
end |
|||
end |
end |
||
local to_type |
|||
local function make_charset(range) |
|||
if from_type == "char" then |
|||
to_type = "byte" |
|||
return char(range[1]) |
|||
else |
|||
end |
|||
to_type = "char" |
|||
sort(range) |
|||
local compressed, n, start = {}, 0, range[1] |
|||
for i = 1, #range do |
|||
local this, nxt = range[i], range[i + 1] |
|||
if nxt ~= this + 1 then |
|||
n = n + 1 |
|||
compressed[n] = this == start and char(this) or |
|||
char(start) .. "-" .. char(this) |
|||
start = nxt |
|||
end |
|||
end |
|||
return "[" .. concat(compressed) .. "]" |
|||
end |
end |
||
-- Positive positions iterate forwards; negative positions iterate backwards. |
|||
local function parse_1_byte_charset(pattern, pos) |
|||
local iterate_val |
|||
while true do |
|||
if pos > 0 then |
|||
local ch, nxt_pos |
|||
iterate_val = 1 |
|||
pos, ch, nxt_pos = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos) |
|||
else |
|||
if not ch then |
|||
iterate_val = -1 |
|||
return false |
|||
elseif ch == "%" then |
|||
if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", nxt_pos) then |
|||
return false |
|||
end |
|||
pos = pos + 2 |
|||
elseif ch == "]" then |
|||
pos = nxt_pos |
|||
return pos |
|||
else |
|||
return false |
|||
end |
|||
end |
|||
end |
end |
||
-- Adjust init_from_type and init_to_type to the iteration before, so that matches for the position given by them will work. |
|||
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==] |
|||
local trail, cp, min, b = 0 |
|||
pattern_simplifier = require("Module:fun").memoize(function(pattern) |
|||
local c, leading_byte = {} |
|||
if type(pattern) == "number" then |
|||
c[from_type] = init_from_type and init_from_type ~= 0 and init_from_type - iterate_val or 0 |
|||
return tostring(pattern) |
|||
c[to_type] = init_to_type and init_to_type ~= 0 and init_to_type - iterate_val or 0 |
|||
while true do |
|||
if pos > 0 then |
|||
b = text:byte(c.byte + 1) |
|||
else |
|||
b = text:byte(text:len() + c.byte) |
|||
end |
end |
||
-- Position byte doesn't exist, so iterate the return value and return it. |
|||
local pos, captures, start, n, output = 1, 0, 1, 0 |
|||
if not b then |
|||
return c[to_type] + iterate_val |
|||
local ch, nxt_pos |
|||
elseif b < 0x80 then |
|||
pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos) |
|||
-- 1-byte codepoint, 00-7F. |
|||
if not ch then |
|||
trail = 0 |
|||
cp = b |
|||
min = 0 |
|||
leading_byte = true |
|||
elseif b < 0xc0 then |
|||
-- A trailing byte. |
|||
leading_byte = false |
|||
elseif b < 0xc2 then |
|||
-- An overlong encoding for a 1-byte codepoint. |
|||
error("String " .. text .. " is not UTF-8.") |
|||
elseif b < 0xe0 then |
|||
-- 2-byte codepoint, C2-DF. |
|||
trail = 1 |
|||
cp = b - 0xc0 |
|||
min = 0x80 |
|||
leading_byte = true |
|||
elseif b < 0xf0 then |
|||
-- 3-byte codepoint, E0-EF. |
|||
trail = 2 |
|||
cp = b - 0xe0 |
|||
min = 0x800 |
|||
leading_byte = true |
|||
elseif b < 0xf4 then |
|||
-- 4-byte codepoint, F0-F3. |
|||
trail = 3 |
|||
cp = b - 0xf0 |
|||
min = 0x10000 |
|||
leading_byte = true |
|||
elseif b == 0xf4 then |
|||
-- 4-byte codepoint, F4. |
|||
-- Make sure it doesn't decode to over U+10FFFF. |
|||
if text:byte(c.byte + 2) > 0x8f then |
|||
error("String " .. text .. " is not UTF-8.") |
|||
end |
end |
||
trail = 3 |
|||
local nxt = sub(pattern, nxt_pos, nxt_pos) |
|||
cp = 4 |
|||
min = 0x100000 |
|||
leading_byte = true |
|||
if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then |
|||
else |
|||
return false |
|||
-- Codepoint over U+10FFFF, or invalid byte. |
|||
end |
|||
error("String " .. text .. " is not UTF-8.") |
|||
pos = pos + 4 |
|||
end |
|||
elseif nxt == "f" then |
|||
pos = pos + 2 |
|||
-- Check subsequent bytes for multibyte codepoints. |
|||
if not match(pattern, "^()%[[^^]", pos) then |
|||
if leading_byte then |
|||
return false |
|||
local from, to |
|||
end |
|||
if pos > 0 then |
|||
-- Only possible to convert a %f charset which is all |
|||
from, to = c.byte + 2, c.byte + 1 + trail |
|||
-- ASCII, so use parse_1_byte_charset. |
|||
pos = parse_1_byte_charset(pattern, pos) |
|||
if not pos then |
|||
return false |
|||
end |
|||
elseif nxt == "Z" then |
|||
pos = pos + 2 |
|||
nxt = sub(pattern, pos, pos) |
|||
if nxt == "*" or nxt == "+" or nxt == "-" then |
|||
pos = pos + 1 |
|||
else |
|||
output = output or {} |
|||
n = n + 1 |
|||
if nxt == "?" then |
|||
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*" |
|||
pos = pos + 1 |
|||
else |
|||
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*" |
|||
end |
|||
start = pos |
|||
end |
|||
elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then |
|||
return false |
|||
-- Skip the next character if it's ASCII. Otherwise, we will |
|||
-- still need to do length checks. |
|||
else |
|||
pos = pos + (byte(nxt) < 128 and 2 or 1) |
|||
end |
|||
elseif ch == "(" then |
|||
if nxt == ")" or captures == 32 then |
|||
return false |
|||
end |
|||
captures = captures + 1 |
|||
pos = pos + 1 |
|||
elseif ch == "." then |
|||
if nxt == "*" or nxt == "+" or nxt == "-" then |
|||
pos = pos + 2 |
|||
else |
|||
output = output or {} |
|||
n = n + 1 |
|||
if nxt == "?" then |
|||
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*" |
|||
pos = pos + 2 |
|||
else |
|||
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*" |
|||
pos = pos + 1 |
|||
end |
|||
start = pos |
|||
end |
|||
elseif ch == "[" then |
|||
-- Fail negative charsets. TODO: 1-byte charsets should be safe. |
|||
if nxt == "^" then |
|||
return false |
|||
-- If the first character is "%", ch_len is determined by the |
|||
-- next one instead. |
|||
elseif nxt == "%" then |
|||
nxt_pos = nxt_pos + 1 |
|||
nxt = sub(pattern, nxt_pos, nxt_pos) |
|||
end |
|||
local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos) |
|||
if ch_len == 1 then -- Single-byte charset. |
|||
pos = parse_1_byte_charset(pattern, pos + 1) |
|||
if not pos then |
|||
return false |
|||
end |
|||
else -- Multibyte charset. |
|||
local charset_pos, bytes = pos |
|||
pos = pos + 1 |
|||
while true do -- TODO: non-ASCII charset ranges. |
|||
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos) |
|||
if not ch then |
|||
return false |
|||
-- If escaped, get the next character. No need to |
|||
-- distinguish magic characters or character classes, |
|||
-- as they'll all fail for having the wrong length |
|||
-- anyway. |
|||
elseif ch == "%" then |
|||
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos) |
|||
elseif ch == "]" then |
|||
pos = nxt_pos |
|||
break |
|||
end |
|||
if ch_len ~= #ch then |
|||
return false |
|||
end |
|||
bytes = bytes or {} |
|||
local bytes = bytes |
|||
for i = 1, ch_len - 1 do |
|||
local b = byte(ch, i, i) |
|||
bytes[b] = bytes[b] or {} |
|||
bytes = bytes[b] |
|||
end |
|||
bytes[byte(ch, -1)] = true |
|||
pos = nxt_pos |
|||
end |
|||
if not pos then |
|||
return false |
|||
end |
|||
local nxt = sub(pattern, pos, pos) |
|||
if ( |
|||
(nxt == "?" or nxt == "*" or nxt == "-") or |
|||
(nxt == "+" and ch_len > 2) or |
|||
not check_sets(bytes) |
|||
) then |
|||
return false |
|||
end |
|||
local ranges, b, key, next_byte = {}, 0 |
|||
repeat |
|||
key, next_byte = next(bytes) |
|||
local range, n = {key}, 1 |
|||
-- Loop starts on the second iteration. |
|||
for key in next, bytes, key do |
|||
n = n + 1 |
|||
range[n] = key |
|||
end |
|||
b = b + 1 |
|||
ranges[b] = range |
|||
bytes = next_byte |
|||
until next_byte == true |
|||
if nxt == "+" then |
|||
local range1, range2 = ranges[1], ranges[2] |
|||
ranges[1] = make_charset(range1) |
|||
ranges[3] = make_charset(range2) |
|||
local n = #range2 |
|||
for i = 1, #range1 do |
|||
n = n + 1 |
|||
range2[n] = range1[i] |
|||
end |
|||
ranges[2] = make_charset(range2) .. "*" |
|||
pos = pos + 1 |
|||
else |
|||
for i = 1, #ranges do |
|||
ranges[i] = make_charset(ranges[i]) |
|||
end |
|||
end |
|||
output = output or {} |
|||
n = n + 1 |
|||
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) |
|||
start = pos |
|||
end |
|||
elseif nxt == "+" then |
|||
if #ch ~= 2 then |
|||
return false |
|||
end |
|||
output = output or {} |
|||
n = n + 1 |
|||
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2) |
|||
pos = nxt_pos + 1 |
|||
start = pos |
|||
elseif nxt == "?" or nxt == "*" or nxt == "-" then |
|||
return false |
|||
else |
else |
||
from, to = text:len() + c.byte + 1, text:len() + c.byte + trail |
|||
pos = nxt_pos |
|||
end |
end |
||
for trailing_byte = from, to do |
|||
end |
|||
b = text:byte(trailing_byte) |
|||
if start == 1 then |
|||
if not b or b < 0x80 or b > 0xbf then |
|||
return pattern |
|||
error("String " .. text .. " is not UTF-8.") |
|||
end |
|||
end |
|||
return concat(output) .. sub(pattern, start) |
|||
cp = cp * 0x40 + b - 0x80 |
|||
end, true) |
|||
end |
|||
export.pattern_simplifier = pattern_simplifier -- For testing. |
|||
local next_byte = text:byte(to + 1) |
|||
end |
|||
if next_byte and next_byte >= 0x80 and next_byte <= 0xbf then |
|||
-- Too many trailing bytes. |
|||
function export.len(str) |
|||
error("String " .. text .. " is not UTF-8.") |
|||
return type(str) == "number" and len(str) or |
|||
elseif cp < min then |
|||
#str - #gsub(str, "[^\128-\191]+", "") |
|||
-- Overlong encoding. |
|||
end |
|||
error("String " .. text .. " is not UTF-8.") |
|||
ulen = export.len |
|||
function export.sub(str, i, j) |
|||
str, i = type(str) == "number" and tostring(str) or str, i or 1 |
|||
if i < 0 or j and j < 0 then |
|||
return usub(str, i, j) |
|||
elseif j and i > j or i > #str then |
|||
return "" |
|||
end |
|||
local n, new_i = 0 |
|||
for loc1, loc2 in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do |
|||
n = n + loc2 - loc1 |
|||
if not new_i and n >= i then |
|||
new_i = loc2 - (n - i) - 1 |
|||
if not j then |
|||
return sub(str, new_i) |
|||
end |
end |
||
end |
end |
||
if j and n > j then |
|||
c.byte = c.byte + iterate_val |
|||
return sub(str, new_i, loc2 - (n - j) - 1) |
|||
if leading_byte then |
|||
c.char = c.char + iterate_val |
|||
end |
end |
||
end |
|||
if c[from_type] == pos then |
|||
return new_i and sub(str, new_i) or "" |
|||
return c[to_type] |
|||
end |
|||
do |
|||
local function _find(str, loc1, loc2, ...) |
|||
if loc1 and not match(str, "^()[^\128-\255]*$") then |
|||
-- Use raw values of loc1 and loc2 to get loc1 and the length of the match. |
|||
loc1, loc2 = ulen(sub(str, 1, loc1)), ulen(sub(str, loc1, loc2)) |
|||
-- Offset length with loc1 to get loc2. |
|||
loc2 = loc1 + loc2 - 1 |
|||
end |
end |
||
return loc1, loc2, ... |
|||
end |
|||
--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==] |
|||
function export.find(str, pattern, init, plain) |
|||
init = init or 1 |
|||
if init ~= 1 and not match(str, "^()[^\128-\255]*$") then |
|||
return ufind(str, pattern, init, plain) |
|||
elseif plain then |
|||
return _find(str, find(str, pattern, init, true)) |
|||
end |
|||
local simple = pattern_simplifier(pattern) |
|||
if simple then |
|||
return _find(str, find(str, simple, init)) |
|||
end |
|||
return ufind(str, pattern, init) |
|||
end |
end |
||
end |
end |
||
--[==[Converts a character position to the equivalent byte position.]==] |
|||
--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==] |
|||
function export. |
function export.charsToBytes(text, pos) |
||
return iterate_utf8(text, pos, "char") |
|||
init = init or 1 |
|||
if init ~= 1 and not match(str, "^()[^\128-\255]*$") then |
|||
return umatch(str, pattern, init) |
|||
end |
|||
local simple = pattern_simplifier(pattern) |
|||
if simple then |
|||
return match(str, simple, init) |
|||
end |
|||
return umatch(str, pattern, init) |
|||
end |
end |
||
--[==[Converts a byte position to the equivalent character position.]==] |
|||
--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==] |
|||
function export. |
function export.bytesToChars(text, pos) |
||
local |
local byte = text:byte(pos) |
||
if byte and byte >= 0x80 and byte <= 0xbf then |
|||
if simple then |
|||
error("Byte " .. pos .. " is not a leading byte.") |
|||
return gmatch(str, simple) |
|||
end |
end |
||
return |
return iterate_utf8(text, pos, "byte") |
||
end |
end |
||
-- A helper function which iterates through a pattern, and returns two values: a potentially modified version of the pattern, and a boolean indicating whether the returned pattern is simple (i.e. whether it can be used with the stock string library); if not, then the pattern is complex (i.e. it must be used with the ustring library, which is much more resource-intensive). |
|||
--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==] |
|||
function |
local function patternSimplifier(text, pattern, plain) |
||
pattern = tostring(pattern) |
|||
-- If `plain` is set, then the pattern is treated as literal (so is always simple). Only used by find. |
|||
if simple then |
|||
if plain then |
|||
return gsub(str, simple, repl, n) |
|||
return pattern, true |
|||
-- If none of these are present, then the pattern has to be simple. |
|||
elseif not ( |
|||
pattern:match("%[.-[\128-\255].-%]") or |
|||
pattern:match("[\128-\255][%*%+%?%-]") or |
|||
pattern:match("%%[abcdlpsuwxACDLPSUWXZ]") or |
|||
pattern:match("%[%^[^%]]+%]") or |
|||
pattern:match("%.[^%*%+%-]") or |
|||
pattern:match("%.$") or |
|||
pattern:match("%%b.?[\128-\255]") or |
|||
pattern:match("()", 1, true) |
|||
) then |
|||
return pattern, true |
|||
end |
end |
||
-- Otherwise, the pattern could go either way. |
|||
-- Build up the new pattern in a table, then concatenate at the end. We do it this way, as occasionally entries get modified along the way. |
|||
end |
|||
local new_pattern = {} |
|||
local len, pos, b = pattern:len(), 0 |
|||
--[==[Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.]==] |
|||
local char, next_char |
|||
function export.plain_gsub(str, pattern, repl, n) |
|||
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n) |
|||
-- `escape` and `balanced` are counters, which ensure the effects of % or %b (respectively) are distributed over the following bytes. |
|||
end |
|||
-- `set` is a boolean that states whether the current byte is in a charset. |
|||
-- `capture` keeps track of how many layers of capture groups the position is in, while `captures` keeps a tally of how many groups have been detected (due to the string library limit of 32). |
|||
--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==] |
|||
local escape, set, balanced, capture, captures = 0, false, 0, 0, 0 |
|||
function export.reverse(str) |
|||
return reverse(gsub(str, "[\194-\244][\128-\191]*", reverse)) |
|||
while pos < len do |
|||
end |
|||
pos = pos + 1 |
|||
b = pattern:byte(pos) |
|||
do |
|||
if escape > 0 then escape = escape - 1 end |
|||
local function err(cp) |
|||
if balanced > 0 then balanced = balanced - 1 end |
|||
error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2) |
|||
char = next_char or pattern:sub(pos, pos) |
|||
end |
|||
next_char = pattern:sub(pos + 1, pos + 1) |
|||
if escape == 0 then |
|||
local function utf8_char(cp) |
|||
if char == "%" then |
|||
cp = tonumber(cp) |
|||
-- Apply % escape. |
|||
if next_char == "." or next_char == "%" or next_char == "[" or next_char == "]" then |
|||
err("-0x" .. format("%X", -cp + 1)) |
|||
escape = 2 |
|||
elseif cp < 0x80 then |
|||
if balanced > 0 then balanced = balanced + 1 end |
|||
return char(cp) |
|||
-- These charsets make the pattern complex. |
|||
elseif cp < 0x800 then |
|||
elseif next_char:match("[acdlpsuwxACDLPSUWXZ]") then |
|||
return char( |
|||
return pattern, false |
|||
0xC0 + cp / 0x40, |
|||
-- This is "%b". |
|||
elseif next_char == "b" then |
|||
) |
|||
balanced = 4 |
|||
elseif cp < 0x10000 then |
|||
end |
|||
if cp >= 0xD800 and cp < 0xE000 then |
|||
-- Enter or leave a charset. |
|||
return "?" -- mw.ustring.char returns "?" for surrogates. |
|||
elseif char == "[" then |
|||
set = true |
|||
elseif char == "]" then |
|||
set = false |
|||
elseif char == "(" then |
|||
capture = capture + 1 |
|||
elseif char == ")" then |
|||
if capture > 0 and set == false and balanced == 0 then |
|||
captures = captures + 1 |
|||
capture = capture - 1 |
|||
end |
|||
end |
end |
||
return char( |
|||
0xE0 + cp / 0x1000, |
|||
0x80 + cp / 0x40 % 0x40, |
|||
0x80 + cp % 0x40 |
|||
) |
|||
elseif cp < 0x110000 then |
|||
return char( |
|||
0xF0 + cp / 0x40000, |
|||
0x80 + cp / 0x1000 % 0x40, |
|||
0x80 + cp / 0x40 % 0x40, |
|||
0x80 + cp % 0x40 |
|||
) |
|||
end |
end |
||
err("0x" .. format("%X", cp)) |
|||
-- Multibyte char. |
|||
end |
|||
if b > 0x7f then |
|||
-- If followed by "*", "+" or "-", then 2-byte chars can be converted into charsets. However, this is not possible with 3 or 4-byte chars, as the charset would be too permissive, because if the trailing bytes were in a different order then this could be a different valid character. |
|||
function export.char(cp, ...) |
|||
if |
if next_char == "*" or next_char == "+" or next_char == "-" then |
||
local prev_pos = pattern:byte(pos - 1) |
|||
return utf8_char(cp) |
|||
if prev_pos > 0xc1 and prev_pos < 0xe0 then |
|||
new_pattern[#new_pattern] = "[" .. new_pattern[#new_pattern] |
|||
table.insert(new_pattern, char .. "]") |
|||
else |
|||
return pattern, false |
|||
end |
|||
-- If in a charset or used in "%b", then the pattern is complex. |
|||
-- If followed by "?", add "?" after each byte. |
|||
elseif next_char == "?" then |
|||
table.insert(new_pattern, char .. "?") |
|||
local check_pos, check_b, i = pos, pattern:byte(pos), #new_pattern |
|||
while check_b and check_b < 0xc0 do |
|||
check_pos = check_pos - 1 |
|||
check_b = pattern:byte(check_pos) |
|||
i = i - 1 |
|||
new_pattern[i] = new_pattern[i] .. "?" |
|||
end |
|||
pos = pos + 1 |
|||
next_char = pattern:sub(pos + 1, pos + 1) |
|||
elseif set or balanced > 0 then |
|||
return pattern, false |
|||
else |
|||
table.insert(new_pattern, char) |
|||
end |
|||
elseif char == "." then |
|||
-- "*", "+", "-" are always okay after ".", as they don't care how many bytes a char has. |
|||
if set or next_char == "*" or next_char == "+" or next_char == "-" or escape > 0 then |
|||
table.insert(new_pattern, char) |
|||
-- If followed by "?", make sure "?" is after the leading byte of the UTF-8 char pattern, then skip forward one. |
|||
elseif next_char == "?" then |
|||
table.insert(new_pattern, "[%z\1-\127\194-\244]?[\128-\191]*") |
|||
pos = pos + 1 |
|||
next_char = pattern:sub(pos + 1, pos + 1) |
|||
-- If used with "%b", pattern is complex. |
|||
elseif balanced > 0 then |
|||
return pattern, false |
|||
-- Otherwise, add the UTF-8 char pattern. |
|||
else |
|||
table.insert(new_pattern, "[%z\1-\127\194-\244][\128-\191]*") |
|||
end |
|||
-- Negative charsets are always complex, unless the text has no UTF-8 chars. |
|||
elseif char == "[" and next_char == "^" and escape == 0 and text:match("[\128-\255]") then |
|||
return pattern, false |
|||
-- "()" matches the position unless escaped or used with "%b", so always necessitates ustring (as we need it to match the char position, not the byte one). |
|||
elseif char == "(" and next_char == ")" and balanced == 0 and escape == 0 and text:match("[\128-\255]") then |
|||
return pattern, false |
|||
else |
|||
table.insert(new_pattern, char) |
|||
end |
end |
||
local ret = {cp, ...} |
|||
for i = 1, select("#", cp, ...) do |
|||
ret[i] = utf8_char(ret[i]) |
|||
end |
|||
return concat(ret) |
|||
end |
end |
||
if captures > 32 then |
|||
u = export.char |
|||
return pattern, false |
|||
else |
|||
pattern = table.concat(new_pattern) |
|||
return pattern, true |
|||
end |
|||
end |
end |
||
--[==[A version of len which uses string.len, but returns the same result as mw.ustring.len.]==] |
|||
do |
|||
function export.len(text) |
|||
text = tostring(text) |
|||
if b1 < 128 then |
|||
local len_bytes = text:len() |
|||
return b1, 1 |
|||
if not text:match("[\128-\255]") then |
|||
elseif b1 < 224 then |
|||
return len_bytes |
|||
else |
|||
elseif b1 < 240 then |
|||
return iterate_utf8(text, len_bytes, "byte") |
|||
end |
|||
return 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080, 4 |
|||
end |
end |
||
end |
|||
--[==[A version of sub which uses string.sub, but returns the same result as mw.ustring.sub.]==] |
|||
function export.codepoint(str, i, j) |
|||
function export.sub(text, i_char, j_char) |
|||
if type(str) == "number" then |
|||
text = tostring(text) |
|||
return byte(str, i, j) |
|||
if not text:match("[\128-\255]") then |
|||
end |
|||
return text:sub(i_char, j_char) |
|||
i, j = i or 1, j == -1 and #str or i or 1 |
|||
if i == 1 and j == 1 then |
|||
return (get_codepoint(byte(str, 1, 4))) |
|||
elseif i < 0 or j < 0 then |
|||
return ucodepoint(str, i, j) -- FIXME |
|||
end |
|||
local n, nb, ret, nr = 0, 1, {}, 0 |
|||
while n < j do |
|||
n = n + 1 |
|||
if n < i then |
|||
local b = byte(str, nb) |
|||
nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4) |
|||
else |
|||
local b1, b2, b3, b4 = byte(str, nb, nb + 3) |
|||
if not b1 then |
|||
break |
|||
end |
|||
nr = nr + 1 |
|||
local add |
|||
ret[nr], add = get_codepoint(b1, b2, b3, b4) |
|||
nb = nb + add |
|||
end |
|||
end |
|||
return unpack(ret) |
|||
end |
end |
||
local i_byte, j_byte |
|||
codepoint = export.codepoint |
|||
if j_char then |
|||
if i_char > 0 and j_char > 0 then |
|||
function export.gcodepoint(str, i, j) |
|||
if j_char < i_char then return "" end |
|||
i, j = i or 1, j ~= -1 and j or nil |
|||
i_byte = iterate_utf8(text, i_char, "char") |
|||
if i < 0 or j and j < 0 then |
|||
j_byte = iterate_utf8(text, j_char + 1, "char", i_char, i_byte) - 1 |
|||
return ugcodepoint(str, i, j) -- FIXME |
|||
elseif i_char < 0 and j_char < 0 then |
|||
end |
|||
if j_char < i_char then return "" end |
|||
local n, nb = 1, 1 |
|||
j_byte = iterate_utf8(text, j_char + 1, "char") - 1 |
|||
while n < i do |
|||
i_byte = iterate_utf8(text, i_char, "char", j_char, j_byte) |
|||
local b = byte(str, nb) |
|||
-- For some reason, mw.ustring.sub with i=0, j=0 returns the same result as for i=1, j=1, while string.sub always returns "". However, mw.ustring.sub does return "" with i=1, j=0. As such, we need to adjust j_char to 1 if i_char is either 0, or negative with a magnitude greater than the length of the string. |
|||
if not b then |
|||
elseif j_char == 0 then |
|||
break |
|||
i_byte = iterate_utf8(text, i_char, "char") |
|||
end |
|||
if i_byte == 0 or -i_byte > text:len() then j_char = 1 end |
|||
nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4) |
|||
j_byte = iterate_utf8(text, j_char + 1, "char") - 1 |
|||
n = n + 1 |
|||
else |
|||
i_byte = iterate_utf8(text, i_char, "char") |
|||
j_byte = iterate_utf8(text, j_char + 1, "char") - 1 |
|||
return function() |
|||
if j and n > j then |
|||
return nil |
|||
end |
|||
n = n + 1 |
|||
local b1, b2, b3, b4 = byte(str, nb, nb + 3) |
|||
if not b1 then |
|||
return nil |
|||
end |
|||
local ret, add = get_codepoint(b1, b2, b3, b4) |
|||
nb = nb + add |
|||
return ret |
|||
end |
end |
||
else |
|||
i_byte = iterate_utf8(text, i_char, "char") |
|||
end |
end |
||
return text:sub(i_byte, j_byte) |
|||
end |
end |
||
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==] |
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==] |
||
function export.lower( |
function export.lower(text) |
||
text = tostring(text) |
|||
return (match(str, "^()[^\128-\255]*$") and lower or ulower)(str) |
|||
if not text:match("[\128-\255]") then |
|||
return text:lower() |
|||
else |
|||
return mw.ustring.lower(text) |
|||
end |
|||
end |
end |
||
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==] |
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==] |
||
function export.upper( |
function export.upper(text) |
||
text = tostring(text) |
|||
return (match(str, "^()[^\128-\255]*$") and upper or uupper)(str) |
|||
if not text:match("[\128-\255]") then |
|||
return text:upper() |
|||
else |
|||
return mw.ustring.upper(text) |
|||
end |
|||
end |
end |
||
--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==] |
|||
do |
|||
function export.find(text, pattern, init_char, plain) |
|||
text = tostring(text) |
|||
-- Insert any captures from the splitting pattern. |
|||
local simple |
|||
local offset, capture = n - 1, ... |
|||
pattern, simple = patternSimplifier(text, pattern, plain) |
|||
while capture do |
|||
-- If the pattern is simple but multibyte characters are present, then init_char needs to be converted into bytes for string.find to work properly, and the return values need to be converted back into chars. |
|||
n = n + 1 |
|||
if simple then |
|||
text[n] = capture |
|||
if not text:match("[\128-\255]") then |
|||
capture = select(n - offset, ...) |
|||
return text:find(pattern, init_char, plain) |
|||
end |
|||
return n |
|||
end |
|||
local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...) |
|||
if not (loc1 and start <= str_len) then |
|||
-- If no match, or there is but we're past the end of the string |
|||
-- (which happens when the match is the empty string), then add |
|||
-- the final chunk and return. |
|||
n = n + 1 |
|||
text[n] = _sub(str, start) |
|||
return |
|||
elseif loc2 < loc1 then |
|||
-- Special case: If we match the empty string, then include the |
|||
-- next character; this avoids an infinite loop, and makes |
|||
-- splitting by an empty string work the way mw.text.split() does |
|||
-- (including non-adjacent empty string matches with %f). If we |
|||
-- reach the end of the string this way, return immediately, so we |
|||
-- don't get a final empty string. If using the string library, we |
|||
-- need to make sure we advance by one UTF-8 character. |
|||
if _sub == sub then |
|||
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1) |
|||
end |
|||
n = n + 1 |
|||
text[n] = _sub(str, start, loc1) |
|||
start = loc1 + 1 |
|||
if start > str_len then |
|||
return ... and add_captures(text, n, ...) or n |
|||
end |
|||
else |
else |
||
local init_byte = init_char and iterate_utf8(text, init_char, "char") |
|||
-- Add chunk up to the current match. |
|||
local byte1, byte2, c1, c2, c3, c4, c5, c6, c7, c8, c9 = text:find(pattern, init_byte, plain) |
|||
n = n + 1 |
|||
text[n] = _sub(str, start, loc1 - 1) |
|||
-- If string.find returned nil, then return nil. |
|||
start = loc2 + 1 |
|||
if not (byte1 and byte2) then |
|||
end |
|||
return nil |
|||
return (... and add_captures(text, n, ...) or n), start |
|||
end |
|||
local function _split(str, pattern, str_len, _sub, _find, plain) |
|||
local text, n, start = {}, 0, 1 |
|||
repeat |
|||
n, start = iterate(str, str_len, text, n, start, _sub, _find(str, pattern, start, plain)) |
|||
until not start |
|||
return text |
|||
end |
|||
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.]==] |
|||
function export.split(str, pattern, str_lib, plain) |
|||
if str_lib or plain then |
|||
return _split(str, pattern, #str, sub, find, plain) |
|||
end |
|||
local simple = pattern_simplifier(pattern) |
|||
if simple then |
|||
return _split(str, simple, #str, sub, find) |
|||
end |
|||
return _split(str, pattern, ulen(str), usub, ufind) |
|||
end |
|||
export.capturing_split = export.split -- To be removed. |
|||
end |
|||
do |
|||
-- TODO: merge this with export.split. Not clear how to do this while |
|||
-- maintaining the same level of performance, as gsplit is slower. |
|||
local function _split(str, pattern, str_len, _sub, _find, plain) |
|||
local start, final = 1 |
|||
local function iter(loc1, loc2, ...) |
|||
-- If no match, return the final chunk. |
|||
if not loc1 then |
|||
final = true |
|||
return _sub(str, start) |
|||
end |
end |
||
-- Special case: If we match the empty string, then eat the |
|||
-- Get first return value. If we have a positive init_char, we can save resources by resuming at that point. |
|||
-- next character; this avoids an infinite loop, and makes |
|||
local char1, char2 |
|||
-- splitting by the empty string work the way mw.text.gsplit() does |
|||
if (not init_char) or init_char > 0 then |
|||
-- (including non-adjacent empty string matches with %f). If we |
|||
char1 = iterate_utf8(text, byte1, "byte", init_byte, init_char) |
|||
-- reach the end of the string this way, set `final` to true, so we |
|||
-- don't get stuck matching the empty string at the end. |
|||
local chunk |
|||
if loc2 < loc1 then |
|||
-- If using the string library, we need to make sure we advance |
|||
-- by one UTF-8 character. |
|||
if _sub == sub then |
|||
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1) |
|||
end |
|||
chunk = _sub(str, start, loc1) |
|||
if loc1 >= str_len then |
|||
final = true |
|||
else |
|||
start = loc1 + 1 |
|||
end |
|||
-- Eat chunk up to the current match. |
|||
else |
else |
||
char1 = iterate_utf8(text, byte1, "byte") |
|||
start = loc2 + 1 |
|||
end |
end |
||
return chunk, ... |
|||
-- If byte1 and byte2 are the same, don't bother running iterate_utf8 twice. Otherwise, resume iterate_utf8 from byte1 to find char2. |
|||
end |
|||
if byte1 == byte2 then |
|||
char2 = char1 |
|||
return function() |
|||
else |
|||
if not final then |
|||
char2 = iterate_utf8(text, byte2, "byte", byte1, char1) |
|||
end |
end |
||
return unpack{char1, char2, c1, c2, c3, c4, c5, c6, c7, c8, c9} |
|||
end |
end |
||
else |
|||
end |
|||
return mw.ustring.find(text, pattern, init_char, plain) |
|||
function export.gsplit(str, pattern, str_lib, plain) |
|||
if str_lib or plain then |
|||
return _split(str, pattern, #str, sub, find, plain) |
|||
end |
|||
local simple = pattern_simplifier(pattern) |
|||
if simple then |
|||
return _split(str, simple, #str, sub, find) |
|||
end |
|||
return _split(str, pattern, ulen(str), usub, ufind) |
|||
end |
end |
||
end |
end |
||
--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==] |
|||
function export.trim(str, charset) |
|||
function export.match(text, pattern, init) |
|||
if not charset then |
|||
text = tostring(text) |
|||
return match(str, "^()%s*$") and "" or match(str, "^%s*(.*%S)") |
|||
local simple |
|||
elseif match(charset, "^()[^\128-\255]*$") then |
|||
pattern, simple = patternSimplifier(text, pattern) |
|||
return match(str, "^()[" .. charset .. "]*$") and "" or match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])") |
|||
if simple then |
|||
if init and text:find("[\128-\255]") then |
|||
init = iterate_utf8(text, init, "char") |
|||
end |
|||
return text:match(pattern, init) |
|||
else |
|||
return mw.ustring.match(text, pattern, init) |
|||
end |
end |
||
return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$") |
|||
end |
end |
||
--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==] |
|||
do |
|||
function export.gmatch(text, pattern) |
|||
local entities |
|||
text = tostring(text) |
|||
local simple |
|||
local function decode_numeric_entity(code, pattern, base) |
|||
pattern, simple = patternSimplifier(text, pattern) |
|||
local cp = match(code, pattern) and tonumber(code, base) |
|||
if simple then |
|||
return cp and cp < 0x110000 and u(cp) or nil |
|||
return text:gmatch(pattern) |
|||
end |
|||
else |
|||
return mw.ustring.gmatch(text, pattern) |
|||
local function decode_entity(hash, x, code) |
|||
if hash == "#" then |
|||
return x == "" and decode_numeric_entity(code, "^%d+$") or |
|||
decode_numeric_entity(code, "^%x+$", 16) |
|||
end |
|||
entities = entities or load_data("Module:data/entities") |
|||
return entities[x .. code] |
|||
end |
|||
-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]]. |
|||
function export.decode_entities(str) |
|||
return find(str, "&", 1, true) and |
|||
gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str |
|||
end |
end |
||
end |
end |
||
--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==] |
|||
do |
|||
function export.gsub(text, pattern, repl, n) |
|||
local html_entities |
|||
text = tostring(text) |
|||
local |
local simple |
||
pattern, simple = patternSimplifier(text, pattern) |
|||
local entity = html_entities[ch] |
|||
if simple then |
|||
return text:gsub(pattern, repl, n) |
|||
else |
|||
end |
|||
return mw.ustring.gsub(text, pattern, repl, n) |
|||
entity = "&#" .. codepoint(ch) .. ";" |
|||
html_entities[ch] = entity |
|||
return entity |
|||
end |
|||
function export.encode_entities(str, charset, str_lib, plain) |
|||
-- Memoized HTML entities (taken from mw.text.lua). |
|||
html_entities = html_entities or { |
|||
["\""] = """, |
|||
["&"] = "&", |
|||
["'"] = "'", |
|||
["<"] = "<", |
|||
[">"] = ">", |
|||
["\194\160"] = " ", |
|||
} |
|||
if not charset then |
|||
return (gsub(str, "[\"&'<>\194]\160?", html_entities)) |
|||
elseif plain then |
|||
return (gsub(str, "[" .. charset_escape(charset) .. "]", encode_entity)) |
|||
elseif str_lib then |
|||
if not match(charset, "^()[^\128-\255]*$") then |
|||
error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.") |
|||
end |
|||
return (gsub(str, "[" .. charset .. "]", encode_entity)) |
|||
end |
|||
local pattern = charset and "[" .. charset .. "]" |
|||
local simple = pattern_simplifier(pattern) |
|||
if simple then |
|||
return (gsub(str, simple, encode_entity)) |
|||
end |
|||
return (ugsub(str, pattern, encode_entity)) |
|||
end |
end |
||
end |
end |
||
--[==[ |
|||
do |
|||
-- Reimplementation of mw.ustring.split() that includes any capturing |
|||
local function decode_path(code) |
|||
-- groups in the splitting pattern. This works like Python's re.split() |
|||
return char(tonumber(code, 16)) |
|||
-- function, except that it has Lua's behavior when the split pattern |
|||
end |
|||
-- is empty (i.e. advancing by one character at a time; Python returns the |
|||
-- whole remainder of the string). |
|||
local function decode(lead, trail) |
|||
]==] |
|||
if lead == "+" or lead == "_" then |
|||
function export.capturing_split(str, pattern) |
|||
return " " .. trail |
|||
local ret = {} |
|||
elseif #trail == 2 then |
|||
-- (.-) corresponds to (.*?) in Python or Perl; () captures the |
|||
return decode_path(trail) |
|||
-- current position after matching. |
|||
pattern = "(.-)" .. pattern .. "()" |
|||
local start = 1 |
|||
while true do |
|||
-- Did we reach the end of the string? |
|||
if start > #str then |
|||
table.insert(ret, "") |
|||
return ret |
|||
end |
end |
||
-- match() returns all captures as multiple return values; |
|||
return lead .. trail |
|||
-- we need to insert into a table to get them all. |
|||
end |
|||
local captures = {export.match(str, pattern, start)} |
|||
-- If no match, add the remainder of the string. |
|||
function export.decode_uri(str, enctype) |
|||
if #captures == 0 then |
|||
enctype = enctype and upper(enctype) or "QUERY" |
|||
table.insert(ret, export.sub(str, start)) |
|||
if enctype == "PATH" then |
|||
return |
return ret |
||
gsub(str, "%%(%x%x)", decode_path) or str |
|||
elseif enctype == "QUERY" then |
|||
return (find(str, "%", 1, true) or find(str, "+", 1, true)) and |
|||
gsub(str, "([%%%+])(%x?%x?)", decode) or str |
|||
elseif enctype == "WIKI" then |
|||
return (find(str, "%", 1, true) or find(str, "_", 1, true)) and |
|||
gsub(str, "([%%_])(%x?%x?)", decode) or str |
|||
end |
|||
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2) |
|||
end |
|||
end |
|||
do |
|||
local function _remove_comments(str, pre) |
|||
local head = find(str, "<!--", 1, true) |
|||
if not head then |
|||
return str |
|||
end |
end |
||
local |
local newstart = table.remove(captures) |
||
-- Special case: If we don't advance by any characters, then advance |
|||
while true do |
|||
-- by one character; this avoids an infinite loop, and makes splitting |
|||
local loc = find(str, "-->", head + 4, true) |
|||
-- by an empty string work the way mw.ustring.split() does. If we |
|||
if not loc then |
|||
-- reach the end of the string this way, return immediately, so we |
|||
return pre and concat(ret) or |
|||
-- don't get a final empty string. |
|||
concat(ret) .. sub(str, head) |
|||
if newstart == start then |
|||
table.insert(ret, export.sub(str, start, start)) |
|||
table.remove(captures, 1) |
|||
start = start + 1 |
|||
if start > #str then |
|||
return ret |
|||
end |
end |
||
else |
|||
head = loc + 3 |
|||
table.insert(ret, table.remove(captures, 1)) |
|||
loc = find(str, "<!--", head, true) |
|||
start = newstart |
|||
return concat(ret) .. sub(str, head) |
|||
end |
|||
n = n + 1 |
|||
ret[n] = sub(str, head, loc - 1) |
|||
head = loc |
|||
end |
end |
||
-- Insert any captures from the splitting pattern. |
|||
end |
|||
for _, x in ipairs(captures) do |
|||
table.insert(ret, x) |
|||
--[==[Removes any HTML comments from the input text. `stage` can be one of three options: |
|||
* {{lua|"PRE"}} (default) applies the method used by MediaWiki's preprocessor: all {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead). |
|||
* {{lua|"POST"}} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code||<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code||<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will have already been applied by the native parser. |
|||
* {{lua|"BOTH"}} applies {{lua|"PRE"}} then {{lua|"POST"}}.]==] |
|||
function export.remove_comments(str, stage) |
|||
if not stage or stage == "PRE" then |
|||
return _remove_comments(str, true) |
|||
end |
|||
local processed = stage == "POST" and _remove_comments(str) or |
|||
stage == "BOTH" and _remove_comments(str, true) or |
|||
error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2) |
|||
while processed ~= str do |
|||
str = processed |
|||
processed = _remove_comments(str) |
|||
end |
end |
||
return str |
|||
end |
end |
||
end |
end |
||
local function uclcfirst(text, dolower) |
|||
--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==] |
|||
function |
local function douclcfirst(text) |
||
-- Actual function to re-case of the first letter. |
|||
return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or "" |
|||
local first_letter = export.sub(text, 1, 1) |
|||
end |
|||
first_letter = dolower and export.lower(first_letter) or export.upper(first_letter) |
|||
php_trim = export.php_trim |
|||
return first_letter .. export.sub(text, 2) |
|||
--[==[Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} is normalized to {{code|lua|1}} (a number), and {{code|lua|" foo "}} is normalized to {{code|lua|"foo"}}. If the input is not a string, it is returned unchanged. |
|||
After being trimmed with {{code|lua|export.php_trim}}, strings are converted to numbers if: |
|||
# They are integers, with no decimals (2.0) or leading zeroes (02). |
|||
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}. |
|||
# For positive values, they do not have a leading {{code|lua|+}} sign.]==] |
|||
function export.scribunto_param_key(key) |
|||
if type(key) ~= "string" then |
|||
return key |
|||
end |
end |
||
-- If there's a link at the beginning, re-case the first letter of the |
|||
key = php_trim(key) |
|||
-- link text. This pattern matches both piped and unpiped links. |
|||
if match(key, "^-?[1-9]%d*$") then |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
local num = tonumber(key) |
|||
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$") |
|||
-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true. |
|||
if link then |
|||
return ( |
|||
return "[[" .. link .. "|" .. douclcfirst(linktext ~= "" and linktext or link) .. "]]" .. remainder |
|||
num <= 9007199254740991 and num >= -9007199254740991 or |
|||
key == "9007199254740992" or |
|||
key == "-9007199254740992" |
|||
) and num or key |
|||
elseif key == "0" then |
|||
return 0 |
|||
end |
end |
||
return |
return douclcfirst(text) |
||
end |
end |
||
function export.ucfirst(text) |
|||
do |
|||
return uclcfirst(text, false) |
|||
local byte_escapes |
|||
local function escape_byte(b) |
|||
return byte_escapes[b] or format("\\%03d", byte(b)) |
|||
end |
|||
function export.escape_bytes(str) |
|||
byte_escapes = byte_escapes or load_data("Module:string utilities/data").byte_escapes |
|||
return (gsub(str, ".", escape_byte)) |
|||
end |
|||
end |
end |
||
function export. |
function export.lcfirst(text) |
||
return uclcfirst(text, true) |
|||
return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2) |
|||
if #p1 + #p2 == 1 then |
|||
return name == "op" and "{" or |
|||
name == "cl" and "}" or |
|||
error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'") |
|||
elseif fun(name) and type(fun(name)) ~= "string" then |
|||
error(module_name .. ".format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string") |
|||
end |
|||
return fun(name) or error(module_name .. ".format: \"" .. name .. "\" not found in table") |
|||
end)) |
|||
end |
|||
format_fun = export.format_fun |
|||
--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash. |
|||
====Examples==== |
|||
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}} |
|||
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}} |
|||
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}} |
|||
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}} |
|||
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==] |
|||
function export.format(str, tbl) |
|||
return format_fun(str, function(key) |
|||
return tbl[key] |
|||
end) |
|||
end |
end |
||
-- Faster version of mw.text.nowiki, with minor changes to match the PHP equivalent: ";" always escapes, and colons in unslashed protocols only escape after regex \b. |
|||
do |
do |
||
local function |
local function escape_char(str1, str2) |
||
if str2 then |
|||
-- Actual function to re-case of the first letter. |
|||
return str1 .. "&#" .. str2:byte() .. ";" |
|||
local first_letter = case_func(match(str, "^.[\128-\191]*") or "") |
|||
end |
|||
return first_letter .. sub(str, #first_letter + 1) |
|||
return "&#" .. str1:byte() .. ";" |
|||
end |
end |
||
local function |
local function escape_uri(uri) |
||
local uri_schemes = mw.loadData("Module:string utilities/data").uri_schemes |
|||
-- If there's a link at the beginning, re-case the first letter of the |
|||
return uri_schemes[uri:lower()] and uri .. ":" or uri .. ":" |
|||
-- link text. This pattern matches both piped and unpiped links. |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$") |
|||
if link then |
|||
return "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder |
|||
end |
|||
return do_uclcfirst(str, case_func) |
|||
end |
end |
||
function export. |
function export.nowiki(text) |
||
return |
return (text |
||
:gsub("[\"&';<=>%[%]{|}]", escape_char) |
|||
:gsub("^[\t\n\r #%*:]", escape_char) |
|||
:gsub("([\n\r])([\t\n\r #%*:])", escape_char) |
|||
:gsub("%f[^%z\r\n]%-(%-%-%-)", "-%1") |
|||
:gsub("__", "__") |
|||
:gsub("://", "://") |
|||
:gsub("(ISBN)(%s)", escape_char) |
|||
:gsub("(PMID)(%s)", escape_char) |
|||
:gsub("(RFC)(%s)", escape_char) |
|||
:gsub("([%w_]+):", escape_uri)) |
|||
end |
end |
||
end |
|||
function export.capitalize(text) |
|||
if type(text) == "table" then |
|||
return uclcfirst(str, ulower) |
|||
-- allow calling from a template |
|||
text = text.args[1] |
|||
end |
end |
||
-- Capitalize multi-word that is separated by spaces |
|||
-- by uppercasing the first letter of each part. |
|||
local function capitalize(w) |
|||
-- I assume nobody will input all CAP text. |
|||
return uclcfirst(w, uupper) |
|||
w2 = {} |
|||
end |
|||
for w in export.gmatch(text, "%S+") do |
|||
table.insert(w2, uclcfirst(w, false)) |
|||
--[==[Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.]==] |
|||
function export.capitalize(str) |
|||
if type(str) == "table" then |
|||
-- allow calling from a template |
|||
str = str.args[1] |
|||
end |
|||
-- Capitalize multi-word that is separated by spaces |
|||
-- by uppercasing the first letter of each part. |
|||
-- I assume nobody will input all CAP text. |
|||
return (ugsub(str, "%S+", capitalize)) |
|||
end |
end |
||
return table.concat(w2, " ") |
|||
end |
end |
||
function export.pluralize(text) |
|||
do |
|||
if type(text) == "table" then |
|||
local function word_ends_in_consonant_plus_y(str) |
|||
-- allow calling from a template |
|||
text = text.args[1] |
|||
end |
|||
-- Pluralize a word in a smart fashion, according to normal English rules. |
|||
-- 1. If word ends in consonant + -y, replace the -y with -ies. |
|||
-- 2. If the word ends in -s, -x, -z, -sh, -ch, add -es. |
|||
-- 3. Otherwise, add -s. |
|||
-- This handles links correctly: |
|||
-- 1. If a piped link, change the second part appropriately. |
|||
-- 2. If a non-piped link and rule #1 above applies, convert to a piped link |
|||
-- with the second part containing the plural. |
|||
-- 3. If a non-piped link and rules #2 or #3 above apply, add the plural |
|||
-- outside the link. |
|||
local function word_ends_in_consonant_plus_y(text) |
|||
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't |
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't |
||
-- apply to proper nouns, hence "the Gettys", "the public Ivys". |
-- apply to proper nouns, hence "the Gettys", "the public Ivys". |
||
ထႅဝ် 964: | ထႅဝ် 605: | ||
-- be important as this function is almost always called on common nouns |
-- be important as this function is almost always called on common nouns |
||
-- (e.g. parts of speech, place types). |
-- (e.g. parts of speech, place types). |
||
return find( |
return text:find("[^aeiouAEIOU ]y$") |
||
end |
end |
||
local function word_takes_es_plural( |
local function word_takes_es_plural(text) |
||
return find( |
return text:find("[sxz]$") or text:find("[cs]h$") |
||
end |
end |
||
local function do_pluralize( |
local function do_pluralize(text) |
||
if word_ends_in_consonant_plus_y( |
if word_ends_in_consonant_plus_y(text) then |
||
-- avoid returning multiple values |
-- avoid returning multiple values |
||
local hack_single_retval = text:gsub("y$", "ies") |
|||
return hack_single_retval |
|||
elseif word_takes_es_plural(str) then |
|||
elseif word_takes_es_plural(text) then |
|||
return str .. "es" |
|||
return text .. "es" |
|||
else |
|||
return text .. "s" |
|||
end |
end |
||
end |
|||
return str .. "s" |
|||
end |
|||
-- Check for a link. This pattern matches both piped and unpiped links. |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
--[==[ |
|||
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$") |
|||
Pluralize a word in a smart fashion, according to normal English rules. |
|||
if link then |
|||
# If word ends in consonant + -y, replace the -y with -ies. |
|||
if linktext ~= "" then |
|||
# If the word ends in -s, -x, -z, -sh, -ch, add -es. |
|||
# Otherwise, add -s. |
|||
This handles links correctly: |
|||
# If a piped link, change the second part appropriately. |
|||
# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural. |
|||
# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link. |
|||
]==] |
|||
function export.pluralize(str) |
|||
if type(str) == "table" then |
|||
-- allow calling from a template |
|||
str = str.args[1] |
|||
end |
|||
-- Check for a link. This pattern matches both piped and unpiped links. |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$") |
|||
if not link then |
|||
return do_pluralize(str) |
|||
elseif linktext ~= "" then |
|||
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]" |
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]" |
||
end |
|||
elseif word_ends_in_consonant_plus_y(link) then |
|||
if word_ends_in_consonant_plus_y(link) then |
|||
return beginning .. "[[" .. link .. "|" .. gsub(link, "y$", "ies") .. "]]" |
|||
return beginning .. "[[" .. link .. "|" .. link:gsub("y$", "ies") .. "]]" |
|||
end |
end |
||
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s") |
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s") |
||
end |
end |
||
return do_pluralize(text) |
|||
end |
end |
||
function export.singularize(text) |
|||
do |
|||
if type(text) == "table" then |
|||
local function do_singularize(str) |
|||
-- allow calling from a template |
|||
local sing = match(str, "^(.-)ies$") |
|||
text = text.args[1] |
|||
end |
|||
-- Singularize a word in a smart fashion, according to normal English rules. |
|||
-- Works analogously to pluralize(). |
|||
-- NOTE: This doesn't always work as well as pluralize(). Beware. It will |
|||
-- mishandle cases like "passes" -> "passe", "eyries" -> "eyry". |
|||
-- 1. If word ends in -ies, replace -ies with -y. |
|||
-- 2. If the word ends in -xes, -shes, -ches, remove -es. [Does not affect |
|||
-- -ses, cf. "houses", "impasses".] |
|||
-- 3. Otherwise, remove -s. |
|||
-- This handles links correctly: |
|||
-- 1. If a piped link, change the second part appropriately. Collapse the |
|||
-- link to a simple link if both parts end up the same. |
|||
-- 2. If a non-piped link, singularize the link. |
|||
-- 3. A link like "[[parish]]es" will be handled correctly because the |
|||
-- code that checks for -shes etc. allows ] characters between the |
|||
-- 'sh' etc. and final -es. |
|||
local function do_singularize(text) |
|||
local sing = text:match("^(.-)ies$") |
|||
if sing then |
if sing then |
||
return sing .. "y" |
return sing .. "y" |
||
end |
end |
||
-- Handle cases like "[[parish]]es" |
-- Handle cases like "[[parish]]es" |
||
local sing = text:match("^(.-[sc]h%]*)es$") |
|||
if sing then |
|||
return sing |
|||
end |
|||
-- Handle cases like "[[box]]es" |
-- Handle cases like "[[box]]es" |
||
local sing = text:match("^(.-x%]*)es$") |
|||
if sing then |
|||
-- Handle regular plurals |
|||
return sing |
|||
match(str, "^(.-)s$") or |
|||
end |
|||
-- Otherwise, return input |
|||
local sing = text:match("^(.-)s$") |
|||
str |
|||
if sing then |
|||
return sing |
|||
end |
|||
return text |
|||
end |
end |
||
local function collapse_link(link, linktext) |
local function collapse_link(link, linktext) |
||
if link == linktext then |
if link == linktext then |
||
return "[[" .. link .. "]]" |
return "[[" .. link .. "]]" |
||
else |
|||
return "[[" .. link .. "|" .. linktext .. "]]" |
|||
end |
end |
||
return "[[" .. link .. "|" .. linktext .. "]]" |
|||
end |
end |
||
--[==[ |
|||
Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}. |
|||
-- Check for a link. This pattern matches both piped and unpiped links. |
|||
'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry". |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
# If word ends in -ies, replace -ies with -y. |
|||
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$") |
|||
# If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".] |
|||
if link then |
|||
# Otherwise, remove -s. |
|||
if linktext ~= "" then |
|||
This handles links correctly: |
|||
# If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same. |
|||
# If a non-piped link, singularize the link. |
|||
# A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the |
|||
'sh' etc. and final -es. |
|||
]==] |
|||
function export.singularize(str) |
|||
if type(str) == "table" then |
|||
-- allow calling from a template |
|||
str = str.args[1] |
|||
end |
|||
-- Check for a link. This pattern matches both piped and unpiped links. |
|||
-- If the link is not piped, the second capture (linktext) will be empty. |
|||
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$") |
|||
if not link then |
|||
return do_singularize(str) |
|||
elseif linktext ~= "" then |
|||
return beginning .. collapse_link(link, do_singularize(linktext)) |
return beginning .. collapse_link(link, do_singularize(linktext)) |
||
end |
end |
||
return beginning .. "[[" .. do_singularize(link) .. "]]" |
return beginning .. "[[" .. do_singularize(link) .. "]]" |
||
end |
end |
||
return do_singularize(text) |
|||
end |
end |
||
--[==[ |
|||
function export.add_indefinite_article(text, uppercase) |
|||
Return the appropriate indefinite article to prefix to `str`. Correctly handles links and capitalized text. |
|||
Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning with |
|||
a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase. |
|||
]==] |
|||
function export.get_indefinite_article(str, ucfirst) |
|||
str = str or "" |
|||
local is_vowel = false |
local is_vowel = false |
||
-- If there's a link at the beginning, examine the first letter of the |
-- If there's a link at the beginning, examine the first letter of the |
||
-- link text. This pattern matches both piped and unpiped links. |
-- link text. This pattern matches both piped and unpiped links. |
||
-- If the link is not piped, the second capture (linktext) will be empty. |
-- If the link is not piped, the second capture (linktext) will be empty. |
||
local link, linktext = match( |
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$") |
||
if link then |
if link then |
||
is_vowel = find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]") |
is_vowel = export.find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]") |
||
else |
else |
||
is_vowel = find( |
is_vowel = export.find(text, "^[AEIOUaeiou]") |
||
end |
end |
||
return is_vowel and ( |
return (is_vowel and (uppercase and "An " or "an ") or (uppercase and "A " or "a ")) .. text |
||
end |
end |
||
get_indefinite_article = export.get_indefinite_article |
|||
-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.). |
|||
--[==[ |
|||
function export.escape_risky_characters(text) |
|||
Prefix `text` with the appropriate indefinite article to prefix to `text`. Correctly handles links and capitalized |
|||
-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software. |
|||
text. Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning |
|||
if not mw.ustring.match(text, "%S") then |
|||
with a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase. |
|||
return mw.text.encode(text, "%s") |
|||
]==] |
|||
else |
|||
function export.add_indefinite_article(text, ucfirst) |
|||
return mw.text.encode(text, "!#%%&*+/:;<=>?@[\\%]_{|}") |
|||
return get_indefinite_article(text, ucfirst) .. " " .. text |
|||
end |
|||
end |
end |
||
ၶိုၼ်းၶူၼ်ႉၶႆႈၼင်ႇ 16:33, 26 မေႇ 2024
Provides some utility functions for manipulating strings.
Functions
export.format_fun
function export.format_fun(str, fun)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.format
function export.format(str, tbl)
This function, unlike string.format and mw.ustring.format, takes just two parameters—a format string and a table—and replaces all instances of {param_name} in the format string with the table's entry for param_name. The opening and closing brace characters can be escaped with {\op} and {\cl}, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
Examples
- string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})
produces: "one fish, two fish, red fish, blue fish"
- string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})
produces: "The set {1, 2, 3} contains three elements."
- Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.
export.explode_utf8
function export.explode_utf8(str)
Explodes a string into an array of UTF8 characters. Warning: this function has no safety checks for non-UTF8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.
export.charsToBytes
function export.charsToBytes(text, pos)
Converts a character position to the equivalent byte position.
export.bytesToChars
function export.bytesToChars(text, pos)
Converts a byte position to the equivalent character position.
export.len
function export.len(text)
A version of len which uses string.len, but returns the same result as mw.ustring.len.
export.sub
function export.sub(text, i_char, j_char)
A version of sub which uses string.sub, but returns the same result as mw.ustring.sub.
export.lower
function export.lower(text)
A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.
export.upper
function export.upper(text)
A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.
export.find
function export.find(text, pattern, init_char, plain)
A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.
export.match
function export.match(text, pattern, init)
A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.
export.gmatch
function export.gmatch(text, pattern)
A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.
export.gsub
function export.gsub(text, pattern, repl, n)
A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.
export.capturing_split
function export.capturing_split(str, pattern)
-- Reimplementation of mw.ustring.split() that includes any capturing
-- groups in the splitting pattern. This works like Python's re.split()
-- function, except that it has Lua's behavior when the split pattern
-- is empty (i.e. advancing by one character at a time; Python returns the
-- whole remainder of the string).
export.ucfirst
function export.ucfirst(text)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.lcfirst
function export.lcfirst(text)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.nowiki
function export.nowiki(text)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.capitalize
function export.capitalize(text)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.pluralize
function export.pluralize(text)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.singularize
function export.singularize(text)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.add_indefinite_article
function export.add_indefinite_article(text, uppercase)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.escape_risky_characters
function export.escape_risky_characters(text)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
-- Name used as the prefix of the error messages raised by format_fun.
local module_name = "string_utilities"
-- Table of exported functions; it is returned at the end of the module.
local export = {}
-- Maps the escape names accepted inside "{\...}" by format_fun to the literal
-- brace character each one stands for.
local format_escapes = {
["op"] = "{",
["cl"] = "}",
}
--[==[Replaces every placeholder of the form <code>{name}</code> in `str` with the string returned by `fun(name)`, and returns the resulting string. The escapes <code>{\op}</code> and <code>{\cl}</code> produce a literal opening and closing brace respectively; a name that genuinely begins with a backslash is written with that backslash doubled. `fun` is called exactly once per placeholder. Throws an error on an unrecognized escape sequence, when `fun` returns a non-string value, or when `fun` returns nil or false.]==]
function export.format_fun(str, fun)
	-- The outer parentheses discard gsub's second return value (the match count).
	return (str:gsub("{(\\?)((\\?)[^{}]*)}", function (p1, name, p2)
		-- Exactly one leading backslash marks an escape sequence; two mean
		-- that the name itself starts with a (single) backslash.
		if #p1 + #p2 == 1 then
			return format_escapes[name] or error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
		end
		-- Look the name up once, so that callbacks which are expensive or
		-- have side effects are not invoked repeatedly.
		local value = fun(name)
		if not value then
			error(module_name .. ".format: '" .. name .. "' not found in table")
		elseif type(value) ~= "string" then
			error(module_name .. ".format: '" .. name .. "' is a " .. type(value) .. ", not a string")
		end
		return value
	end))
end
--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
	-- Placeholder names are resolved by a direct lookup in `tbl`.
	local function lookup(key)
		return tbl[key]
	end
	return export.format_fun(str, lookup)
end
--[==[Explodes a string into an array of UTF8 characters. '''Warning''': this function has no safety checks for non-UTF8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
function export.explode_utf8(str)
	local byte, sub = string.byte, string.sub
	local chars, count = {}, 0
	local pos, total = 1, #str
	while pos <= total do
		-- The value of the first byte decides how many bytes the character
		-- spans; no validation of the following bytes is performed.
		local lead = byte(str, pos)
		local width
		if lead < 0xC0 then
			width = 1
		elseif lead < 0xE0 then
			width = 2
		elseif lead < 0xF0 then
			width = 3
		else
			width = 4
		end
		count = count + 1
		chars[count] = sub(str, pos, pos + width - 1)
		pos = pos + width
	end
	return chars
end
-- A helper function which takes a string, position and type ("byte" or "char"), and returns the equivalent position for the other type (e.g. iterate_utf8("字典", 2, "char") returns 4, because character 2 of "字典" begins with byte 4). `pos` can be positive or negative, and the function will iterate over the string forwards or backwards (respectively) until it reaches the input position. Checks byte-by-byte; skipping over trailing bytes, and then calculating the correct byte trail for any leading bytes (i.e. how many trailing bytes should follow); these trailing bytes are then checked together.
-- The optional parameters `init_from_type` and `init_to_type` can be used to start part-way through an iteration to improve performance, if multiple values need to be returned from the same string. For example, iterate_utf8("слова́рь", 11, "byte", 5, 3) will begin checking at byte 5/the start of character 3. Note: The function won't check if these values match each other (as the only way to do this would be to run the iteration from the beginning), so mismatched values will return incorrect results.
-- If the string ends before `pos` is reached, the position one step past the last counted position is returned. Throws "String ... is not UTF-8." when an invalid sequence is encountered on the way.
local function iterate_utf8(text, pos, from_type, init_from_type, init_to_type)
	-- Position 0 is always valid and never changes.
	if pos == 0 then
		return pos
	end
	local to_type
	if from_type == "char" then
		to_type = "byte"
	else
		to_type = "char"
	end
	-- Positive positions iterate forwards; negative positions iterate backwards.
	local iterate_val
	if pos > 0 then
		iterate_val = 1
	else
		iterate_val = -1
	end
	-- Adjust init_from_type and init_to_type to the iteration before, so that matches for the position given by them will work.
	local trail, cp, min, b = 0
	-- `c` holds the running "byte" and "char" counters, keyed by type name.
	local c, leading_byte = {}
	c[from_type] = init_from_type and init_from_type ~= 0 and init_from_type - iterate_val or 0
	c[to_type] = init_to_type and init_to_type ~= 0 and init_to_type - iterate_val or 0
	while true do
		if pos > 0 then
			b = text:byte(c.byte + 1)
		else
			b = text:byte(text:len() + c.byte)
		end
		-- Position byte doesn't exist, so iterate the return value and return it.
		if not b then
			return c[to_type] + iterate_val
		elseif b < 0x80 then
			-- 1-byte codepoint, 00-7F.
			trail = 0
			cp = b
			min = 0
			leading_byte = true
		elseif b < 0xc0 then
			-- A trailing byte.
			leading_byte = false
		elseif b < 0xc2 then
			-- An overlong encoding for a 1-byte codepoint.
			error("String " .. text .. " is not UTF-8.")
		elseif b < 0xe0 then
			-- 2-byte codepoint, C2-DF.
			trail = 1
			cp = b - 0xc0
			min = 0x80
			leading_byte = true
		elseif b < 0xf0 then
			-- 3-byte codepoint, E0-EF.
			trail = 2
			cp = b - 0xe0
			min = 0x800
			leading_byte = true
		elseif b < 0xf4 then
			-- 4-byte codepoint, F0-F3.
			trail = 3
			cp = b - 0xf0
			min = 0x10000
			leading_byte = true
		elseif b == 0xf4 then
			-- 4-byte codepoint, F4.
			-- Make sure it doesn't decode to over U+10FFFF. The byte to
			-- inspect is the one directly after the current byte, whose
			-- index depends on the direction of iteration. If it is missing
			-- altogether, the trailing-byte check below reports the error.
			local second
			if pos > 0 then
				second = text:byte(c.byte + 2)
			else
				second = text:byte(text:len() + c.byte + 1)
			end
			if second and second > 0x8f then
				error("String " .. text .. " is not UTF-8.")
			end
			trail = 3
			cp = 4
			min = 0x100000
			leading_byte = true
		else
			-- Codepoint over U+10FFFF, or invalid byte.
			error("String " .. text .. " is not UTF-8.")
		end
		-- Check subsequent bytes for multibyte codepoints.
		if leading_byte then
			local from, to
			if pos > 0 then
				from, to = c.byte + 2, c.byte + 1 + trail
			else
				from, to = text:len() + c.byte + 1, text:len() + c.byte + trail
			end
			for trailing_byte = from, to do
				b = text:byte(trailing_byte)
				if not b or b < 0x80 or b > 0xbf then
					error("String " .. text .. " is not UTF-8.")
				end
				-- Each trailing byte contributes its low six bits.
				cp = cp * 0x40 + b - 0x80
			end
			local next_byte = text:byte(to + 1)
			if next_byte and next_byte >= 0x80 and next_byte <= 0xbf then
				-- Too many trailing bytes.
				error("String " .. text .. " is not UTF-8.")
			elseif cp < min then
				-- Overlong encoding.
				error("String " .. text .. " is not UTF-8.")
			end
		end
		-- Every byte advances the byte counter; only leading bytes advance the character counter.
		c.byte = c.byte + iterate_val
		if leading_byte then
			c.char = c.char + iterate_val
		end
		if c[from_type] == pos then
			return c[to_type]
		end
	end
end
--[==[Converts a character position to the equivalent byte position.]==]
function export.charsToBytes(text, pos)
	-- Count characters until `pos` is reached and report the byte counter.
	local byte_pos = iterate_utf8(text, pos, "char")
	return byte_pos
end
--[==[Converts a byte position to the equivalent character position.]==]
function export.bytesToChars(text, pos)
	local b = text:byte(pos)
	-- Bytes in the range 0x80-0xBF are trailing bytes, which do not start a
	-- character and so have no character position of their own.
	local is_trailing = b and b >= 0x80 and b <= 0xbf
	if is_trailing then
		error("Byte " .. pos .. " is not a leading byte.")
	end
	return iterate_utf8(text, pos, "byte")
end
-- A helper function which iterates through a pattern, and returns two values: a potentially modified version of the pattern, and a boolean indicating whether the returned pattern is simple (i.e. whether it can be used with the stock string library); if not, then the pattern is complex (i.e. it must be used with the ustring library, which is much more resource-intensive).
-- Parameters: `text` is the string the pattern will be applied to (only inspected for bytes above 0x7F); `pattern` is converted with tostring; `plain` marks the pattern as literal.
-- When the pattern is judged complex, the (stringified) input pattern is returned unmodified together with false.
local function patternSimplifier(text, pattern, plain)
pattern = tostring(pattern)
-- If `plain` is set, then the pattern is treated as literal (so is always simple). Only used by find.
if plain then
return pattern, true
--If none of these are present, then the pattern has to be simple.
-- NOTE(review): string.match takes no `plain` argument, so the last test below treats "()" as a position capture, which always succeeds; this looks like it makes the whole pre-check unable to short-circuit. Presumably pattern:find("()", 1, true) was intended — confirm before changing.
elseif not (
pattern:match("%[.-[\128-\255].-%]") or
pattern:match("[\128-\255][%*%+%?%-]") or
pattern:match("%%[abcdlpsuwxACDLPSUWXZ]") or
pattern:match("%[%^[^%]]+%]") or
pattern:match("%.[^%*%+%-]") or
pattern:match("%.$") or
pattern:match("%%b.?[\128-\255]") or
pattern:match("()", 1, true)
) then
return pattern, true
end
-- Otherwise, the pattern could go either way.
-- Build up the new pattern in a table, then concatenate at the end. we do it this way, as occasionally entries get modified along the way.
local new_pattern = {}
local len, pos, b = pattern:len(), 0
local char, next_char
-- `escape` and `balanced` are counters, which ensure the effects of % or %b (respectively) are distributed over the following bytes.
-- `set` is a boolean that states whether the current byte is in a charset.
-- `capture` keeps track of how many layers of capture groups the position is in, while `captures` keeps a tally of how many groups have been detected (due to the string library limit of 32).
local escape, set, balanced, capture, captures = 0, false, 0, 0, 0
-- Walk the pattern one byte at a time; `char` is the current byte as a string and `next_char` is a one-byte lookahead.
while pos < len do
pos = pos + 1
b = pattern:byte(pos)
if escape > 0 then escape = escape - 1 end
if balanced > 0 then balanced = balanced - 1 end
char = next_char or pattern:sub(pos, pos)
next_char = pattern:sub(pos + 1, pos + 1)
-- Magic characters are only interpreted when the current byte is not itself covered by a preceding "%".
if escape == 0 then
if char == "%" then
-- Apply % escape.
if next_char == "." or next_char == "%" or next_char == "[" or next_char == "]" then
escape = 2
if balanced > 0 then balanced = balanced + 1 end
-- These charsets make the pattern complex.
elseif next_char:match("[acdlpsuwxACDLPSUWXZ]") then
return pattern, false
-- This is "%b".
elseif next_char == "b" then
balanced = 4
end
-- Enter or leave a charset.
elseif char == "[" then
set = true
elseif char == "]" then
set = false
elseif char == "(" then
capture = capture + 1
elseif char == ")" then
if capture > 0 and set == false and balanced == 0 then
captures = captures + 1
capture = capture - 1
end
end
end
-- Multibyte char.
if b > 0x7f then
-- If followed by "*", "+" or "-", then 2-byte chars can be converted into charsets. However, this is not possible with 3 or 4-byte chars, as the charset would be too permissive, because if the trailing bytes were in a different order then this could be a different valid character.
if next_char == "*" or next_char == "+" or next_char == "-" then
-- Despite its name, `prev_pos` holds the value of the previous byte, not a position.
local prev_pos = pattern:byte(pos - 1)
if prev_pos > 0xc1 and prev_pos < 0xe0 then
new_pattern[#new_pattern] = "[" .. new_pattern[#new_pattern]
table.insert(new_pattern, char .. "]")
else
return pattern, false
end
-- If in a charset or used in "%b", then the pattern is complex.
-- If followed by "?", add "?" after each byte.
elseif next_char == "?" then
table.insert(new_pattern, char .. "?")
-- Walk backwards over the bytes already emitted for this character, appending "?" to each, until its leading byte (>= 0xC0) has been handled.
local check_pos, check_b, i = pos, pattern:byte(pos), #new_pattern
while check_b and check_b < 0xc0 do
check_pos = check_pos - 1
check_b = pattern:byte(check_pos)
i = i - 1
new_pattern[i] = new_pattern[i] .. "?"
end
-- Skip over the "?" that has just been consumed.
pos = pos + 1
next_char = pattern:sub(pos + 1, pos + 1)
elseif set or balanced > 0 then
return pattern, false
else
table.insert(new_pattern, char)
end
elseif char == "." then
-- "*", "+", "-" are always okay after ".", as they don't care how many bytes a char has.
if set or next_char == "*" or next_char == "+" or next_char == "-" or escape > 0 then
table.insert(new_pattern, char)
-- If followed by "?", make sure "?" is after the leading byte of the UTF-8 char pattern, then skip forward one.
elseif next_char == "?" then
table.insert(new_pattern, "[%z\1-\127\194-\244]?[\128-\191]*")
pos = pos + 1
next_char = pattern:sub(pos + 1, pos + 1)
-- If used with "%b", pattern is complex.
elseif balanced > 0 then
return pattern, false
-- Otherwise, add the UTF-8 char pattern.
else
table.insert(new_pattern, "[%z\1-\127\194-\244][\128-\191]*")
end
-- Negative charsets are always complex, unless the text has no UTF-8 chars.
elseif char == "[" and next_char == "^" and escape == 0 and text:match("[\128-\255]") then
return pattern, false
-- "()" matches the position unless escaped or used with "%b", so always necessitates ustring (as we need it to match the char position, not the byte one).
elseif char == "(" and next_char == ")" and balanced == 0 and escape == 0 and text:match("[\128-\255]") then
return pattern, false
else
table.insert(new_pattern, char)
end
end
-- Too many capture groups for the string library: fall back to the original pattern and report it as complex.
if captures > 32 then
return pattern, false
else
pattern = table.concat(new_pattern)
return pattern, true
end
end
--[==[A version of len which uses string.len, but returns the same result as mw.ustring.len.]==]
function export.len(text)
	text = tostring(text)
	-- Only strings containing bytes above 0x7F need the character count to
	-- be derived from the byte count.
	if text:match("[\128-\255]") then
		return iterate_utf8(text, text:len(), "byte")
	end
	return text:len()
end
--[==[A version of sub which uses string.sub, but returns the same result as mw.ustring.sub.]==]
function export.sub(text, i_char, j_char)
	text = tostring(text)
	-- Without bytes above 0x7F, character and byte positions coincide.
	if not text:match("[\128-\255]") then
		return text:sub(i_char, j_char)
	end
	-- `first` and `last` are the byte positions handed to string.sub.
	local first, last
	if not j_char then
		first = iterate_utf8(text, i_char, "char")
	elseif i_char > 0 and j_char > 0 then
		if j_char < i_char then
			return ""
		end
		first = iterate_utf8(text, i_char, "char")
		-- Resume the forward scan from the start position already found.
		last = iterate_utf8(text, j_char + 1, "char", i_char, first) - 1
	elseif i_char < 0 and j_char < 0 then
		if j_char < i_char then
			return ""
		end
		last = iterate_utf8(text, j_char + 1, "char") - 1
		-- Resume the backward scan from the end position already found.
		first = iterate_utf8(text, i_char, "char", j_char, last)
	elseif j_char == 0 then
		-- For some reason, mw.ustring.sub with i=0, j=0 returns the same result as for i=1, j=1, while string.sub always returns "". However, mw.ustring.sub does return "" with i=1, j=0. As such, we need to adjust j_char to 1 if i_char is either 0, or negative with a magnitude greater than the length of the string.
		first = iterate_utf8(text, i_char, "char")
		if first == 0 or -first > text:len() then
			j_char = 1
		end
		last = iterate_utf8(text, j_char + 1, "char") - 1
	else
		first = iterate_utf8(text, i_char, "char")
		last = iterate_utf8(text, j_char + 1, "char") - 1
	end
	return text:sub(first, last)
end
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
function export.lower(text)
	text = tostring(text)
	-- Any byte above 0x7F means the ustring library is needed.
	if text:match("[\128-\255]") then
		return mw.ustring.lower(text)
	end
	return text:lower()
end
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
function export.upper(text)
	text = tostring(text)
	-- Any byte above 0x7F means the ustring library is needed.
	if text:match("[\128-\255]") then
		return mw.ustring.upper(text)
	end
	return text:upper()
end
--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==]
-- Parameters and return values mirror mw.ustring.find: `init_char` and the
-- two returned indices are character positions, followed by any captures.
-- Returns nil when there is no match.
function export.find(text, pattern, init_char, plain)
	text = tostring(text)
	local simple
	pattern, simple = patternSimplifier(text, pattern, plain)
	if not simple then
		return mw.ustring.find(text, pattern, init_char, plain)
	elseif not text:match("[\128-\255]") then
		-- No multibyte characters, so byte and character positions coincide.
		return text:find(pattern, init_char, plain)
	end
	-- The pattern is simple but multibyte characters are present, so init_char needs to be converted into bytes for string.find to work properly, and the return values need to be converted back into chars.
	local init_byte = init_char and iterate_utf8(text, init_char, "char")
	-- Collect every return value in a table, so that no capture is lost however many the pattern has. Captures are never nil, so the table has no holes.
	local results = {text:find(pattern, init_byte, plain)}
	local byte1, byte2 = results[1], results[2]
	-- If string.find returned nil, then return nil.
	if not (byte1 and byte2) then
		return nil
	end
	-- Get first return value. If we have a positive init_char, we can save resources by resuming at that point.
	local char1
	if (not init_char) or init_char > 0 then
		char1 = iterate_utf8(text, byte1, "byte", init_byte, init_char)
	else
		char1 = iterate_utf8(text, byte1, "byte")
	end
	local char2
	if byte2 < byte1 then
		-- An empty match ends one position before it starts. Resuming iterate_utf8 from byte1 could never reach byte2, so derive the value directly.
		char2 = char1 - 1
	elseif byte1 == byte2 then
		-- If byte1 and byte2 are the same, don't bother running iterate_utf8 twice.
		char2 = char1
	else
		-- Otherwise, resume iterate_utf8 from byte1 to find char2.
		char2 = iterate_utf8(text, byte2, "byte", byte1, char1)
	end
	results[1], results[2] = char1, char2
	return unpack(results)
end
--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==]
function export.match(text, pattern, init)
	text = tostring(text)
	local simplified, is_simple = patternSimplifier(text, pattern)
	if not is_simple then
		return mw.ustring.match(text, simplified, init)
	end
	-- string.match expects a byte offset, so convert `init` when the text
	-- contains multibyte characters.
	if init and text:find("[\128-\255]") then
		init = iterate_utf8(text, init, "char")
	end
	return text:match(simplified, init)
end
--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==]
function export.gmatch(text, pattern)
	text = tostring(text)
	local simplified, is_simple = patternSimplifier(text, pattern)
	-- Fall back to the ustring library only when the pattern requires it.
	if not is_simple then
		return mw.ustring.gmatch(text, simplified)
	end
	return text:gmatch(simplified)
end
--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==]
function export.gsub(text, pattern, repl, n)
	text = tostring(text)
	local simplified, is_simple = patternSimplifier(text, pattern)
	-- Fall back to the ustring library only when the pattern requires it.
	-- Both branches pass through every return value of the underlying gsub.
	if not is_simple then
		return mw.ustring.gsub(text, simplified, repl, n)
	end
	return text:gsub(simplified, repl, n)
end
--[==[
Reimplementation of mw.ustring.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). Returns an array of the pieces of `str`, interleaved with the captures of each separator match.
]==]
function export.capturing_split(str, pattern)
	local ret = {}
	-- (.-) corresponds to (.*?) in Python or Perl; () captures the
	-- current position after matching.
	pattern = "(.-)" .. pattern .. "()"
	-- `start` and the positions returned by "()" are character positions,
	-- so the end of the string must be measured in characters as well;
	-- the byte length #str is larger for text with multibyte characters.
	local str_len = export.len(str)
	local start = 1
	while true do
		-- Did we reach the end of the string?
		if start > str_len then
			table.insert(ret, "")
			return ret
		end
		-- match() returns all captures as multiple return values;
		-- we need to insert into a table to get them all.
		local captures = {export.match(str, pattern, start)}
		-- If no match, add the remainder of the string.
		if #captures == 0 then
			table.insert(ret, export.sub(str, start))
			return ret
		end
		local newstart = table.remove(captures)
		-- Special case: If we don't advance by any characters, then advance
		-- by one character; this avoids an infinite loop, and makes splitting
		-- by an empty string work the way mw.ustring.split() does. If we
		-- reach the end of the string this way, return immediately, so we
		-- don't get a final empty string.
		if newstart == start then
			table.insert(ret, export.sub(str, start, start))
			table.remove(captures, 1)
			start = start + 1
			if start > str_len then
				return ret
			end
		else
			table.insert(ret, table.remove(captures, 1))
			start = newstart
		end
		-- Insert any captures from the splitting pattern.
		for _, x in ipairs(captures) do
			table.insert(ret, x)
		end
	end
end
-- Re-cases the first character of `text`: to lowercase when `dolower` is
-- truthy, otherwise to uppercase. If `text` begins with a wikilink, the first
-- character of the displayed link text is re-cased instead and the link is
-- returned in piped form.
local function uclcfirst(text, dolower)
	local function recase_first(s)
		local head = export.sub(s, 1, 1)
		if dolower then
			head = export.lower(head)
		else
			head = export.upper(head)
		end
		return head .. export.sub(s, 2)
	end
	-- This pattern matches both piped and unpiped links at the beginning of
	-- the text. For an unpiped link the second capture (linktext) is empty.
	local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
	if not link then
		return recase_first(text)
	end
	local display = linktext ~= "" and linktext or link
	return "[[" .. link .. "|" .. recase_first(display) .. "]]" .. remainder
end
-- Uppercases the first character of `text` (or of the link text, if `text`
-- begins with a wikilink).
function export.ucfirst(text)
	local to_lower = false
	return uclcfirst(text, to_lower)
end
-- Lowercases the first character of `text` (or of the link text, if `text`
-- begins with a wikilink).
function export.lcfirst(text)
	local to_lower = true
	return uclcfirst(text, to_lower)
end
-- Faster version of mw.text.nowiki, with minor changes to match the PHP equivalent: ";" always escapes, and colons in unslashed protocols only escape after regex \b.
-- Takes a string and returns it with wikitext-significant characters replaced by numeric HTML entities.
do
	-- Replaces a character with its decimal numeric entity. When called with
	-- two captures, the first is kept as-is and only the second is escaped.
	local function escape_char(str1, str2)
		if str2 then
			return str1 .. "&#" .. str2:byte() .. ";"
		end
		return "&#" .. str1:byte() .. ";"
	end
	-- Escapes the colon after `uri` only if `uri` is a known URI scheme;
	-- otherwise the colon is left alone.
	local function escape_uri(uri)
		local uri_schemes = mw.loadData("Module:string utilities/data").uri_schemes
		return uri_schemes[uri:lower()] and uri .. "&#58;" or uri .. ":"
	end
	function export.nowiki(text)
		-- The outer parentheses discard the match count returned by the final gsub.
		return (text
			:gsub("[\"&';<=>%[%]{|}]", escape_char)
			-- Characters that are only significant at the start of a line.
			:gsub("^[\t\n\r #%*:]", escape_char)
			:gsub("([\n\r])([\t\n\r #%*:])", escape_char)
			-- A run of four hyphens at the start of a line (horizontal rule).
			:gsub("%f[^%z\r\n]%-(%-%-%-)", "&#45;%1")
			-- Double underscores (behaviour switches).
			:gsub("__", "_&#95;")
			-- Slashed protocols.
			:gsub("://", "&#58;//")
			-- Magic links: the whitespace after the keyword is escaped.
			:gsub("(ISBN)(%s)", escape_char)
			:gsub("(PMID)(%s)", escape_char)
			:gsub("(RFC)(%s)", escape_char)
			-- Unslashed protocols.
			:gsub("([%w_]+):", escape_uri))
	end
end
-- Capitalizes a multi-word string by uppercasing the first letter of each
-- whitespace-separated part; the parts are rejoined with single spaces.
-- `text` may also be a frame-like table, in which case its first argument
-- is used. All-caps input is not treated specially.
function export.capitalize(text)
	if type(text) == "table" then
		-- allow calling from a template
		text = text.args[1]
	end
	-- Kept local so that no global variable is created as a side effect.
	local words = {}
	for word in export.gmatch(text, "%S+") do
		words[#words + 1] = uclcfirst(word, false)
	end
	return table.concat(words, " ")
end
-- Pluralize a word in a smart fashion, according to normal English rules.
-- 1. If word ends in consonant + -y, replace the -y with -ies.
-- 2. If the word ends in -s, -x, -z, -sh, -ch, add -es.
-- 3. Otherwise, add -s.
-- This handles links correctly:
-- 1. If a piped link, change the second part appropriately.
-- 2. If a non-piped link and rule #1 above applies, convert to a piped link
--    with the second part containing the plural.
-- 3. If a non-piped link and rules #2 or #3 above apply, add the plural
--    outside the link.
function export.pluralize(text)
	if type(text) == "table" then
		-- allow calling from a template
		text = text.args[1]
	end
	local function needs_ies(word)
		-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
		-- apply to proper nouns, hence "the Gettys", "the public Ivys".
		-- We should maybe consider applying this rule here; but it may not
		-- be important as this function is almost always called on common nouns
		-- (e.g. parts of speech, place types).
		return word:find("[^aeiouAEIOU ]y$")
	end
	local function needs_es(word)
		return word:find("[sxz]$") or word:find("[cs]h$")
	end
	local function plural_of(word)
		if needs_ies(word) then
			-- The parentheses keep only gsub's first return value.
			return (word:gsub("y$", "ies"))
		end
		return word .. (needs_es(word) and "es" or "s")
	end
	-- Check for a link. This pattern matches both piped and unpiped links.
	-- If the link is not piped, the second capture (linktext) will be empty.
	local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
	if not link then
		return plural_of(text)
	end
	if linktext ~= "" then
		return beginning .. "[[" .. link .. "|" .. plural_of(linktext) .. "]]"
	end
	if needs_ies(link) then
		return beginning .. "[[" .. link .. "|" .. (link:gsub("y$", "ies")) .. "]]"
	end
	return beginning .. "[[" .. link .. "]]" .. (needs_es(link) and "es" or "s")
end
-- Singularize a word in a smart fashion, according to normal English rules.
-- Works analogously to pluralize().
-- NOTE: This doesn't always work as well as pluralize(). Beware. It will
-- mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
-- 1. If word ends in -ies, replace -ies with -y.
-- 2. If the word ends in -xes, -shes, -ches, remove -es. [Does not affect
--    -ses, cf. "houses", "impasses".]
-- 3. Otherwise, remove -s.
-- This handles links correctly:
-- 1. If a piped link, change the second part appropriately. Collapse the
--    link to a simple link if both parts end up the same.
-- 2. If a non-piped link, singularize the link.
-- 3. A link like "[[parish]]es" will be handled correctly because the
--    code that checks for -shes etc. allows ] characters between the
--    'sh' etc. and final -es.
function export.singularize(text)
	if type(text) == "table" then
		-- allow calling from a template
		text = text.args[1]
	end
	local function singular_of(word)
		local stem = word:match("^(.-)ies$")
		if stem then
			return stem .. "y"
		end
		-- Tried in order: "[[parish]]es"-style endings, "[[box]]es"-style
		-- endings, then a plain final -s. The first that matches wins.
		stem = word:match("^(.-[sc]h%]*)es$")
			or word:match("^(.-x%]*)es$")
			or word:match("^(.-)s$")
		return stem or word
	end
	-- Check for a link. This pattern matches both piped and unpiped links.
	-- If the link is not piped, the second capture (linktext) will be empty.
	local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
	if not link then
		return singular_of(text)
	end
	if linktext == "" then
		return beginning .. "[[" .. singular_of(link) .. "]]"
	end
	local singular = singular_of(linktext)
	if link == singular then
		-- Both parts ended up the same, so collapse to a simple link.
		return beginning .. "[[" .. link .. "]]"
	end
	return beginning .. "[[" .. link .. "|" .. singular .. "]]"
end
-- Prefixes `text` with "an" if it begins with one of the letters a, e, i, o, u
-- (in either case) and with "a" otherwise; the article is capitalized when
-- `uppercase` is truthy. If `text` begins with a wikilink, the first letter of
-- the displayed link text is examined instead.
function export.add_indefinite_article(text, uppercase)
	-- This pattern matches both piped and unpiped links at the beginning of
	-- the text. For an unpiped link the second capture (linktext) is empty.
	local link, linktext = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
	local probe = text
	if link then
		probe = linktext ~= "" and linktext or link
	end
	local article
	if export.find(probe, "^[AEIOUaeiou]") then
		article = uppercase and "An " or "an "
	else
		article = uppercase and "A " or "a "
	end
	return article .. text
end
-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
function export.escape_risky_characters(text)
	local charset
	if mw.ustring.match(text, "%S") then
		charset = "!#%%&*+/:;<=>?@[\\%]_{|}"
	else
		-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software.
		charset = "%s"
	end
	return mw.text.encode(text, charset)
end
return export