Module:languages: Difference between revisions

From Acadēmīa Latīnitātis
(Created page with "local export = {} local function do_replacements(text, self, sc, replacement_data, function_name, recursed) -- If there are language-specific substitutes given in the data module, use those. if type(replacement_data) == "table" then -- If a script is specified, run this function with the script-specific data before continuing. local sc_code = sc:getCode() if replacement_data[sc_code] then text = do_replacements(text, self, sc, replacement_data[sc_code], funct...")
 
No edit summary
 
Line 11: Line 11:
elseif replacement_data.Han and (sc_code == "Hant" or sc_code == "Hans" or sc_code == "Hani") then
elseif replacement_data.Han and (sc_code == "Hant" or sc_code == "Hans" or sc_code == "Hani") then
text = do_replacements(text, self, sc, replacement_data.Han, function_name, true)
text = do_replacements(text, self, sc, replacement_data.Han, function_name, true)
-- Replacement data with key 1 in the outer table may be given as a fallback.
elseif replacement_data[1] then
text = do_replacements(text, self, sc, replacement_data[1], function_name, true)
end
end
-- Iterate over all strings in the "from" subtable, and gsub with the corresponding string in "to". We work with the NFD decomposed forms, as this simplifies many replacements.
-- Iterate over all strings in the "from" subtable, and gsub with the corresponding string in "to". We work with the NFD decomposed forms, as this simplifies many replacements.
Line 29: Line 32:
-- This will need to be reviewed if any characters in the SIP (U+2XXXX) or TIP (U+3XXXX) need to be processed by it, but as these planes are exclusively CJK characters as of 2022, this is unlikely to happen for the time being. However, it is unwise to start using non-PUA codepoints in the U+4XXXX-U+EXXXX range, as support for these is completely untested, so they may result in unpredictable behaviour.
-- This will need to be reviewed if any characters in the SIP (U+2XXXX) or TIP (U+3XXXX) need to be processed by it, but as these planes are exclusively CJK characters as of 2022, this is unlikely to happen for the time being. However, it is unwise to start using non-PUA codepoints in the U+4XXXX-U+EXXXX range, as support for these is completely untested, so they may result in unpredictable behaviour.
if replacement_data.remove_exceptions then
if replacement_data.remove_exceptions then
local char, codepoint, len, substitute = mw.ustring.char, mw.ustring.codepoint, mw.ustring.len
local u, codepoint, len, substitute = mw.ustring.char, mw.ustring.codepoint, mw.ustring.len
for _,exception in ipairs(replacement_data.remove_exceptions) do
for _,exception in ipairs(replacement_data.remove_exceptions) do
exception = sc:toFixedNFD(exception)
exception = sc:toFixedNFD(exception)
substitute = {codepoint(exception, 1, len(exception))}
substitute = {codepoint(exception, 1, len(exception))}
for i, codepoint in ipairs(substitute) do substitute[i] = char(codepoint+0xF0000) end
for i, codepoint in ipairs(substitute) do substitute[i] = u(codepoint+0xF0000) end
text = text:gsub(exception, table.concat(substitute))
text = text:gsub(exception, table.concat(substitute))
end
end
Line 42: Line 45:
if replacement_data.remove_exceptions then
if replacement_data.remove_exceptions then
for _,exception in ipairs(replacement_data.remove_exceptions) do
for _,exception in ipairs(replacement_data.remove_exceptions) do
local char, codepoint, len, substitute = mw.ustring.char, mw.ustring.codepoint, mw.ustring.len
local u, codepoint, len, substitute = mw.ustring.char, mw.ustring.codepoint, mw.ustring.len
exception = sc:toFixedNFD(exception)
exception = sc:toFixedNFD(exception)
substitute = {codepoint(exception, 1, len(exception))}
substitute = {codepoint(exception, 1, len(exception))}
for i, codepoint in ipairs(substitute) do substitute[i] = char(codepoint+0xF0000) end
for i, codepoint in ipairs(substitute) do substitute[i] = u(codepoint+0xF0000) end
text = text:gsub(table.concat(substitute), exception)
text = text:gsub(table.concat(substitute), exception)
end
end
Line 80: Line 83:
return self._code
return self._code
end
end


function Language:getCanonicalName()
function Language:getCanonicalName()
return self._rawData[1] or self._rawData.canonicalName
return self._rawData[1] or self._rawData.canonicalName
end
end


function Language:getDisplayForm()
function Language:getDisplayForm()
return self:getCanonicalName()
return self:getCanonicalName()
end
end


function Language:getOtherNames(onlyOtherNames)
function Language:getOtherNames(onlyOtherNames)
Line 96: Line 96:
return require("Module:language-like").getOtherNames(self, onlyOtherNames)
return require("Module:language-like").getOtherNames(self, onlyOtherNames)
end
end


function Language:getAliases()
function Language:getAliases()
Line 102: Line 101:
return self._extraData.aliases or {}
return self._extraData.aliases or {}
end
end


function Language:getVarieties(flatten)
function Language:getVarieties(flatten)
Line 108: Line 106:
return require("Module:language-like").getVarieties(self, flatten)
return require("Module:language-like").getVarieties(self, flatten)
end
end


function Language:getType()
function Language:getType()
return self._rawData.type or "regular"
return self._rawData.type or "regular"
end
end


function Language:getWikimediaLanguages()
function Language:getWikimediaLanguages()
Line 187: Line 183:
return self._familyObject
return self._familyObject
end
end


function Language:getAncestors()
function Language:getAncestors()
Line 244: Line 239:
return self._ancestorChain
return self._ancestorChain
end
end


function Language:hasAncestor(otherlang)
function Language:hasAncestor(otherlang)
Line 253: Line 247:
return iterateOverAncestorTree(self, compare) or false
return iterateOverAncestorTree(self, compare) or false
end
end


function Language:getCategoryName(nocap)
function Language:getCategoryName(nocap)
Line 267: Line 260:
return name
return name
end
end


function Language:makeCategoryLink()
function Language:makeCategoryLink()
return "[[:Category:" .. self:getCategoryName() .. "|" .. self:getDisplayForm() .. "]]"
return "[[:Category:" .. self:getCategoryName() .. "|" .. self:getDisplayForm() .. "]]"
end
end


function Language:getStandardCharacters()
function Language:getStandardCharacters()
Line 278: Line 269:
end
end


local function getEntities(text)
return text
:gsub("&#(%d+);", mw.ustring.char)
:gsub("&#x(%x+);", function(cap1) return mw.ustring.char(tonumber(cap1, 16)) end)
end
local function processCarets(text)
local u = mw.ustring.char
return text
:gsub("\\\\^", u(0xE000) .. "^")
:gsub("\\^", u(0xE001))
:gsub("%^", "")
:gsub(u(0xE000), "\\")
:gsub(u(0xE001), "^")
end


-- If an initial colon is present and the text doesn't match an unsupported title beginning with a colon, remove it and return two values: the modified text and true. Otherwise, return text.
local function escapeUnsupportedTitle(text)
local function processEscape(text)
text = {mw.ustring.codepoint(text, 1, mw.ustring.len(text))}
local escaped
for j, char in ipairs(text) do
if text and text:match("^:") and not mw.loadData("Module:links/data").unsupported_titles[text] then
text[j] = "&#" .. char .. ";"
text = text:gsub("^:", "")
escaped = true
end
end
return text, not not escaped
return table.concat(text)
end
end


 
function Language:makeEntryName(text, sc, unsupportedTitle)
function Language:makeEntryName(text, sc)
if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
-- Strip bold.
-- Strip bold and soft hyphens.
text = text:gsub("('*)'''(.-'*)'''", "%1%2")
text = text
-- Strip soft hyphens.
:gsub("('*)'''(.-'*)'''", "%1%2")
text = text:gsub("­", "")
:gsub("­", "")
text = mw.text.unstrip(text)
-- Don't strip italics, as that would allow people to use it instead of {{m}} etc.
-- Don't strip italics, as that would allow people to use it instead of {{m}} etc.
local escaped; text, escaped = processEscape(text)
-- Remove caret if it is used to capitalize parts of transliterations (unless this has been escaped).
if not escaped and not sc:hasCapitalization() and sc:getCode() ~= "None" then text = text:gsub("%^", "") end
text = mw.text.unstrip(text)
text = sc:fixDiscouragedSequences(text)
text = sc:fixDiscouragedSequences(text)
text = sc:toFixedNFD(text)
text = sc:toFixedNFD(text)
text = mw.ustring.match(text, "^[¿¡]?(.-[^%s%p].-)%s*[؟?!;՛՜ ՞ ՟?!︖︕।॥။၊་།]?$") or text
return do_replacements(text, self, sc, self._rawData.entry_name, "makeEntryName")
-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped).
if not sc:hasCapitalization() and sc:getCode() ~= "None" and text:find("%^") then
text = processCarets(text)
end
-- Deal with interwiki link prefixes.
if text:find(":") then
local u = mw.ustring.char
text = text
:gsub("\\\\:", u(0xE000) .. ":")
:gsub("\\:", u(0xE001))
:gsub(u(0xE000), "\\")
if not text:find("^:") then
text = do_replacements(text, self, sc, self._rawData.entry_name, "makeEntryName")
else
text = text:gsub("^:", "")
end
if text:find(u(0xE001)) then
text = text:gsub(u(0xE001), ":")
end
else
text = do_replacements(text, self, sc, self._rawData.entry_name, "makeEntryName")
end
local unsupportedTitle = mw.loadData("Module:links/data").unsupported_titles[text]
if unsupportedTitle then
return "Unsupported titles/" .. unsupportedTitle, true
end
return mw.ustring.match(text, "^[¿¡]?(.-[^%s%p].-)%s*[؟?!;՛՜ ՞ ՟?!︖︕।॥။၊་།]?$") or text
end
end


-- Generates alternative forms using a specified method, and returns them as a table. If no method is specified, returns a table containing only the input term.
function Language:generateForms(text, sc)
if self._rawData.generate_forms then
if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
return require("Module:" .. self._rawData.generate_forms).generateForms(text, self:getCode(), sc:getCode())
else
return {text}
end
end


-- Return true if the language has display processing enabled, i.e. lang:makeDisplayText()
-- Return true if the language has display processing enabled, i.e. lang:makeDisplayText()
Line 314: Line 351:
return not not self._rawData.display_text
return not not self._rawData.display_text
end
end


-- Apply display-text replacements to `text`, if any.
-- Apply display-text replacements to `text`, if any.
function Language:makeDisplayText(text, sc, keepCarets)
function Language:makeDisplayText(text, sc, unsupportedTitle, keepCarets, keepColons)
if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
local escaped; text, escaped = processEscape(text)
-- Remove caret if it is used to capitalize parts of transliterations (unless this has been escaped).
if not escaped and not keepCarets and not sc:hasCapitalization() and sc:getCode() ~= "None" then text = text:gsub("%^", "") end
-- Remove any interwiki link prefixes.
if text:match(":") and not escaped then
local prefix = text:match("^([^:]+):")
local interwikis = mw.loadData("Module:languages/shareddata").interwikis
if interwikis[prefix] then
-- Remove prefix plus colon.
text = text:sub(#prefix + 2)
-- If, additionally, there's a language code after the interwiki link, strip that too.
local languageCode = text:match("^([^:]+):")
if languageCode then
text = text:sub(#languageCode + 2)
end
end
end
-- Temporarily convert strip markers to PUA characters to prevent them from being disrupted by the substitution process.
-- Temporarily convert strip markers to PUA characters to prevent them from being disrupted by the substitution process.
local u, i, stripMarkers = mw.ustring.char, 1, {}
local u, i, stripMarkers = mw.ustring.char, 1, {}
for stripMarker in text:gmatch("[.*]-" .. u(0x7F) .. "'\"`UNIQ%-%-%l+%-%x+%-QINU`\"'" .. u(0x7F)) do
for stripMarker in text:gmatch(u(0x7F) .. "'\"`UNIQ%-%-%l+%-%x+%-QINU`\"'" .. u(0x7F)) do
stripMarkers[i] = stripMarker
stripMarkers[i] = stripMarker
text = text:gsub(stripMarker, u(0xE700+i), 1)
stripMarker = stripMarker:gsub("-", "%%-")
text = text:gsub(stripMarker, u(0xFF700+i), 1)
i = i + 1
i = i + 1
end
end
text = unsupportedTitle and text or getEntities(text)
if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
text = sc:fixDiscouragedSequences(text)
text = sc:fixDiscouragedSequences(text)
text = sc:toFixedNFD(text)
text = sc:toFixedNFD(text)
text = do_replacements(text, self, sc, self._rawData.display_text, "makeDisplayText")
-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped).
if not keepCarets and not sc:hasCapitalization() and sc:getCode() ~= "None" and text:find("%^") then
text = processCarets(text)
end
-- Remove any interwiki link prefixes (unless they have been escaped or this has been disabled).
if text:find(":") and not keepColons then
local u = mw.ustring.char
text = text
:gsub("\\\\:", u(0xE000) .. ":")
:gsub("\\:", u(0xE001))
local prefix = text:match("^([^:]*):") or ""
local interwikis = mw.loadData("Module:languages/shareddata").interwikis
if interwikis[prefix] and text:find(":") ~= #text then
-- Remove prefix plus colon.
text = text:gsub("^" .. prefix .. ":(.*)", "%1")
-- If, additionally, there's a language code after the interwiki link, strip that too.
if text:find(":") ~= #text then text = text:gsub("^[^:]*:(.*)", "%1") end
end
text = text
:gsub(u(0xE000), "\\")
:gsub(u(0xE001), ":")
end
text = unsupportedTitle and escapeUnsupportedTitle(text) or text
for j = 1, #stripMarkers do
for j = 1, #stripMarkers do
text = text:gsub(u(0xE700+j), stripMarkers[j])
text = text
:gsub(u(0xFF700+j), stripMarkers[j])
:gsub("&#" .. 0xFF700+j .. ";", stripMarkers[j])
end
end
 
-- Return whether the text was escaped, as this is used by the transliterate function.
return text
return do_replacements(text, self, sc, self._rawData.display_text, "makeDisplayText"), escaped
end
end


function Language:makeSortKey(text, sc)
function Language:makeSortKey(text, sc, unsupportedTitle)
text = unsupportedTitle and text or getEntities(text)
if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
local escaped; text, escaped = processEscape(text)
-- Remove caret if it is used to capitalize parts of transliterations (unless this has been escaped).
if not escaped and not sc:hasCapitalization() and sc:getCode() ~= "None" then text = text:gsub("%^", "") end
local gsub = mw.ustring.gsub
-- Remove initial hyphens and *.
-- Remove initial hyphens and *.
text = mw.text.unstrip(text)
text = mw.text.unstrip(text)
text = gsub(text, "^[-־ـ᠊*]+(.)", "%1")
text = mw.ustring.gsub(text, "^[-־ـ᠊*]+(.)", "%1")
--Normalize.
--Normalize.
text = sc:fixDiscouragedSequences(text)
text = sc:fixDiscouragedSequences(text)
text = sc:toFixedNFD(text)
text = sc:toFixedNFD(text)
-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped).
if not sc:hasCapitalization() and sc:getCode() ~= "None" and text:find("%^") then
text = processCarets(text)
end
-- For languages with dotted dotless i, ensure that "İ" is sorted as "i", and "I" is sorted as "ı".
-- For languages with dotted dotless i, ensure that "İ" is sorted as "i", and "I" is sorted as "ı".
if self._rawData.dotted_dotless_i then
if self._rawData.dotted_dotless_i then
text = text:gsub(mw.ustring.toNFD("İ"), "i")
text = text
text = sc:toFixedNFD(text:gsub("I", "ı"))
:gsub(mw.ustring.toNFD("İ"), "i")
:gsub("I", "ı")
text = sc:toFixedNFD(text)
end
end
-- Convert to lowercase, make the sortkey, then convert to uppercase. Where the language has dotted dotless i, it is usually not necessary to convert "i" to "İ" and "ı" to "I" first, because "I" will always be interpreted as conventional "I" (not dotless "İ") by any sorting algorithms, which will have been taken into account by the sortkey substitutions themselves. However, if no sortkey substitutions have been specified, then conversion is necessary so as to prevent "i" and "ı" both being sorted as "I".
-- Convert to lowercase, make the sortkey, then convert to uppercase. Where the language has dotted dotless i, it is usually not necessary to convert "i" to "İ" and "ı" to "I" first, because "I" will always be interpreted as conventional "I" (not dotless "İ") by any sorting algorithms, which will have been taken into account by the sortkey substitutions themselves. However, if no sortkey substitutions have been specified, then conversion is necessary so as to prevent "i" and "ı" both being sorted as "I".
Line 376: Line 432:
text = do_replacements(text, self, sc, self._rawData.sort_key, "makeSortKey")
text = do_replacements(text, self, sc, self._rawData.sort_key, "makeSortKey")
if self._rawData.dotted_dotless_i and not self._rawData.sort_key then
if self._rawData.dotted_dotless_i and not self._rawData.sort_key then
text = text:gsub("ı", "I")
text = text
text = sc:toFixedNFC(text:gsub("i", "İ"))
:gsub("ı", "I")
:gsub("i", "İ")
text = sc:toFixedNFC(text)
end
end
text = mw.ustring.upper(text)
text = mw.ustring.upper(text)
-- Remove parentheses, as long as they are either preceded or followed by something.
-- Remove parentheses, as long as they are either preceded or followed by something.
text = gsub(text, "(.)[()]+", "%1")
text = text
text = gsub(text, "[()]+(.)", "%1")
:gsub("(.)[()]+", "%1")
:gsub("[()]+(.)", "%1")
return text
return unsupportedTitle and escapeUnsupportedTitle(text) or text
end
end
function Language:overrideManualTranslit()
return not not self._rawData.override_translit
end


function Language:transliterate(text, sc, module_override)
function Language:transliterate(text, sc, module_override)
Line 402: Line 456:
end
end
-- Get the display form, and whether a colon-initial escape has been used.
-- Get the display form.
if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
text = mw.text.unstrip(text)
text = mw.text.unstrip(text)
local escaped; text, escaped = self:makeDisplayText(text, sc, true)
text = self:makeDisplayText(text, sc, nil, true)
-- Transliterate.
-- Transliterate.
text = require("Module:" .. (module_override or self._rawData.translit_module)).tr(text, self:getCode(), sc:getCode())
text = require("Module:" .. (module_override or self._rawData.translit_module)).tr(text, self:getCode(), sc:getCode())
-- If the text hasn't been escaped and the script does not use capitalization, then capitalize any letters of the transliteration which are immediately preceded by a caret, and then remove it. Otherwise, just return the text.
-- If the script does not use capitalization, then capitalize any letters of the transliteration which are immediately preceded by a caret, and then remove it. Otherwise, just return the text.
if text and not escaped and text:match("%^") and not sc:hasCapitalization() and sc:getCode() ~= "None" then
if text and not sc:hasCapitalization() and sc:getCode() ~= "None" and text:find("%^") then
return mw.ustring.gsub(text, "(%^)(%l)", function(a, b) return mw.ustring.upper(b) end)
local u = mw.ustring.char
text = text
:gsub("\\\\^", u(0xE000) .. "^")
:gsub("\\^", u(0xE001))
return mw.ustring.gsub(text, "%^(%l)", mw.ustring.upper)
:gsub(u(0xE000), "\\")
:gsub(u(0xE001), "^")
else
else
return text
return text
end
end
end
function Language:overrideManualTranslit()
return not not self._rawData.override_translit
end
end


Line 421: Line 485:
return self._rawData.translit_module and true or false
return self._rawData.translit_module and true or false
end
end


function Language:link_tr()
function Language:link_tr()
return self._rawData.link_tr and true or false
return self._rawData.link_tr and true or false
end
end


function Language:toJSON()
function Language:toJSON()
Line 461: Line 523:
return require("Module:JSON").toJSON(ret)
return require("Module:JSON").toJSON(ret)
end
end


-- Do NOT use these methods!
-- Do NOT use these methods!
Line 475: Line 536:


Language.__index = Language
Language.__index = Language


function export.getDataModuleName(code)
function export.getDataModuleName(code)
Line 503: Line 563:
end
end
end
end


local function getRawLanguageData(code)
local function getRawLanguageData(code)
Line 509: Line 568:
return modulename and mw.loadData("Module:" .. modulename)[code] or nil
return modulename and mw.loadData("Module:" .. modulename)[code] or nil
end
end


local function getRawExtraLanguageData(code)
local function getRawExtraLanguageData(code)
Line 515: Line 573:
return modulename and mw.loadData("Module:" .. modulename)[code] or nil
return modulename and mw.loadData("Module:" .. modulename)[code] or nil
end
end


function Language:loadInExtraData()
function Language:loadInExtraData()
Line 524: Line 581:
end
end
end
end


function export.makeObject(code, data)
function export.makeObject(code, data)
Line 536: Line 592:
return data and setmetatable({_rawData = data, _code = code, _type = "language object"}, Language) or nil
return data and setmetatable({_rawData = data, _code = code, _type = "language object"}, Language) or nil
end
end


function export.getByCode(code, paramForError, allowEtymLang, allowFamily)
function export.getByCode(code, paramForError, allowEtymLang, allowFamily)
Line 555: Line 610:
return retval
return retval
end
end


function export.getByName(name, errorIfInvalid)
function export.getByName(name, errorIfInvalid)

Latest revision as of 04:18, 14 February 2023

Documentation for this module may be created at Module:languages/doc

local export = {}

-- Applies the substitutions described by `replacement_data` to `text` and returns the result.
--
-- text: the string to transform.
-- self: the language object; here it is only used for self:getCode() (passed to a dedicated module) and forwarded on recursion.
-- sc: the script object; this function calls sc:getCode(), sc:toFixedNFD(), sc:toFixedNFC() and sc:fixDiscouragedSequences() on it.
-- replacement_data: either a table (which may hold script-code keys, `Han`, `[1]`, `from`/`to`, `remove_diacritics` and `remove_exceptions`), or a string naming a module or a key of Module:languages/shareddata. For any other type (e.g. nil) no substitutions are made.
-- function_name: the name of the function to call on a dedicated module (the callers in this file pass e.g. "makeEntryName").
-- recursed: true when called from within itself; the final normalization is then left to the outermost call.
local function do_replacements(text, self, sc, replacement_data, function_name, recursed)
	-- If there are language-specific substitutes given in the data module, use those.
	if type(replacement_data) == "table" then
		-- If a script is specified, run this function with the script-specific data before continuing.
		local sc_code = sc:getCode()
		if replacement_data[sc_code] then
			text = do_replacements(text, self, sc, replacement_data[sc_code], function_name, true)
		-- Hant, Hans and Hani don't sort differently, so add a special case to avoid having to specify each one separately.
		elseif replacement_data.Han and (sc_code == "Hant" or sc_code == "Hans" or sc_code == "Hani") then
			text = do_replacements(text, self, sc, replacement_data.Han, function_name, true)
		-- Replacement data with key 1 in the outer table may be given as a fallback.
		elseif replacement_data[1] then
			text = do_replacements(text, self, sc, replacement_data[1], function_name, true)
		end
		-- Iterate over all strings in the "from" subtable, and gsub with the corresponding string in "to". We work with the NFD decomposed forms, as this simplifies many replacements.
		-- Note that "to" is indexed unconditionally, so it must be present whenever "from" is; a missing entry at a given index deletes the match.
		if replacement_data.from then
			local gsub
			for i, from in ipairs(replacement_data.from) do
				-- We normalize each loop, to ensure multi-stage substitutions work correctly.
				text = sc:toFixedNFD(text)
				-- Check whether specific magic characters are present, as they rely on UTF-8 compatibility. If not, just use string.gsub. In most cases, doing this is faster than using mw.ustring.gsub every time.
				if from:match("[%%.[%]*+%-?]") then gsub = mw.ustring.gsub else gsub = string.gsub end
				text = gsub(text, sc:toFixedNFD(from), replacement_data.to[i] or "")
			end
		end
		
		if replacement_data.remove_diacritics then
			text = sc:toFixedNFD(text)
			-- Convert any specified exceptions into PUA characters, to avoid having diacritics stripped. Uses the supplementary PUA planes (U+FXXXX & U+10XXXX), to ensure that any characters in the BMP (U+0XXXX) or SMP (U+1XXXX) can be round-trip converted to PUA.
			-- This will need to be reviewed if any characters in the SIP (U+2XXXX) or TIP (U+3XXXX) need to be processed by it, but as these planes are exclusively CJK characters as of 2022, this is unlikely to happen for the time being. However, it is unwise to start using non-PUA codepoints in the U+4XXXX-U+EXXXX range, as support for these is completely untested, so they may result in unpredictable behaviour.
			if replacement_data.remove_exceptions then
				-- `substitute` is declared here without a value and reassigned for each exception.
				local u, codepoint, len, substitute = mw.ustring.char, mw.ustring.codepoint, mw.ustring.len
				for _,exception in ipairs(replacement_data.remove_exceptions) do
					exception = sc:toFixedNFD(exception)
					-- Split the exception into its codepoints, then shift each one up by 0xF0000.
					substitute = {codepoint(exception, 1, len(exception))}
					-- The loop variable `codepoint` shadows the function of the same name inside this one-line loop only.
					for i, codepoint in ipairs(substitute) do substitute[i] = u(codepoint+0xF0000) end
					-- NOTE(review): `exception` is handed to string.gsub as a pattern, so any pattern magic characters in it are interpreted rather than matched literally.
					text = text:gsub(exception, table.concat(substitute))
				end
			end
			-- Strip diacritics. This must use mw.ustring.gsub, to ensure the character class is UTF-8 compatible.
			text = mw.ustring.gsub(text, "[" .. replacement_data.remove_diacritics .. "]", "")
			-- Convert any exceptions back.
			if replacement_data.remove_exceptions then
				for _,exception in ipairs(replacement_data.remove_exceptions) do
					-- Recompute the same shifted form as above and swap it back for the original exception.
					local u, codepoint, len, substitute = mw.ustring.char, mw.ustring.codepoint, mw.ustring.len
					exception = sc:toFixedNFD(exception)
					substitute = {codepoint(exception, 1, len(exception))}
					for i, codepoint in ipairs(substitute) do substitute[i] = u(codepoint+0xF0000) end
					text = text:gsub(table.concat(substitute), exception)
				end
			end
		end
	elseif type(replacement_data) == "string" then
		-- If there is a dedicated function module, use that.
		-- pcall is used so that a failing require falls through to the shared-data lookup instead of raising.
		local is_module, module = pcall(require, "Module:" .. replacement_data)
		if is_module then
			text = module[function_name](sc:toFixedNFD(text), self:getCode(), sc:getCode())
		-- If there is matching shared data, run this function again using that as the replacement data.
		else
			local m_shared = mw.loadData("Module:languages/shareddata")
			if m_shared[replacement_data] then
				-- `recursed` is passed through unchanged, so that call performs the final normalization itself when this is the outermost call.
				return do_replacements(text, self, sc, m_shared[replacement_data], function_name, recursed)
			else
				error("Replacement data does not match any shared data or an existing module.")
			end
		end
	end
	
	if not recursed then
		-- Fix any discouraged sequences created during the substitution process, and normalize into the final form.
		text = sc:fixDiscouragedSequences(text)
		return sc:toFixedNFC(text)
	else
		return text
	end
end

local Language = {}

-- Returns the code stored on this object in its `_code` field.
function Language:getCode()
	local code = self._code
	return code
end

-- Returns the canonical name from the raw data: the first positional entry if it is set, otherwise the `canonicalName` field.
function Language:getCanonicalName()
	local data = self._rawData
	local positional_name = data[1]
	if positional_name then
		return positional_name
	end
	return data.canonicalName
end

-- Returns the form used for display, which for this object is simply its canonical name.
function Language:getDisplayForm()
	local display_form = self:getCanonicalName()
	return display_form
end

-- Loads the extra data for this language, then delegates to Module:language-like's getOtherNames.
-- `onlyOtherNames` is passed through to that function unchanged.
function Language:getOtherNames(onlyOtherNames)
	self:loadInExtraData()
	local m_language_like = require("Module:language-like")
	return m_language_like.getOtherNames(self, onlyOtherNames)
end

-- Loads the extra data for this language and returns its `aliases` entry, or a new empty table if there is none.
function Language:getAliases()
	self:loadInExtraData()
	local aliases = self._extraData.aliases
	if aliases then
		return aliases
	end
	return {}
end

-- Loads the extra data for this language, then delegates to Module:language-like's getVarieties.
-- `flatten` is passed through to that function unchanged.
function Language:getVarieties(flatten)
	self:loadInExtraData()
	local m_language_like = require("Module:language-like")
	return m_language_like.getVarieties(self, flatten)
end

-- Returns the `type` field of the raw data, defaulting to "regular" when it is not set.
function Language:getType()
	local language_type = self._rawData.type
	if not language_type then
		language_type = "regular"
	end
	return language_type
end

-- Returns the list of Wikimedia language objects for this language, building it on first use and caching it in `_wikimediaLanguageObjects`.
-- The codes come from `wikimedia_codes` in the raw data, falling back to this language's own code.
function Language:getWikimediaLanguages()
	if self._wikimediaLanguageObjects then
		return self._wikimediaLanguageObjects
	end
	
	local m_wikimedia_languages = require("Module:wikimedia languages")
	self._wikimediaLanguageObjects = {}
	
	local codes = self._rawData.wikimedia_codes
	if not codes then
		codes = {self:getCode()}
	end
	-- ipairs (rather than a numeric loop over #codes) is kept because the codes may come from a data table.
	for _, code in ipairs(codes) do
		table.insert(self._wikimediaLanguageObjects, m_wikimedia_languages.getByCode(code))
	end
	
	return self._wikimediaLanguageObjects
end

-- Returns the name of the Wikipedia article for this language.
-- Order of preference: an explicit `wikipedia_article` in the raw data, a previously cached result, the 'enwiki' sitelink of the Wikidata item (only when mw.wikibase is available), and finally the category name with "Creole language" shortened to "Creole".
-- Anything other than the explicit raw-data value is cached in `_wikipedia_article`.
function Language:getWikipediaArticle()
	local explicit = self._rawData.wikipedia_article
	if explicit then
		return explicit
	end
	if self._wikipedia_article then
		return self._wikipedia_article
	end
	
	if self:getWikidataItem() and mw.wikibase then
		self._wikipedia_article = mw.wikibase.sitelink(self:getWikidataItem(), 'enwiki')
	end
	if not self._wikipedia_article then
		-- Assigning to a local keeps only the string and drops gsub's replacement count.
		local from_category = self:getCategoryName():gsub("Creole language", "Creole")
		self._wikipedia_article = from_category
	end
	
	return self._wikipedia_article
end

-- Builds a wikilink to the language's Wikipedia article (via the "w:" prefix), labelled with the canonical name.
function Language:makeWikipediaLink()
	local article = self:getWikipediaArticle()
	local label = self:getCanonicalName()
	return "[[w:" .. article .. "|" .. label .. "]]"
end

-- Returns the second positional entry of the raw data as a Wikidata item.
-- A number is prefixed with "Q"; any other value (including nil) is returned as it is.
function Language:getWikidataItem()
	local item = self._rawData[2]
	if type(item) ~= "number" then
		return item
	end
	return "Q" .. item
end

-- Returns the list of script objects for this language, building it on first use from getScriptCodes() and caching it in `_scriptObjects`.
function Language:getScripts()
	if self._scriptObjects then
		return self._scriptObjects
	end
	
	local m_scripts = require("Module:scripts")
	self._scriptObjects = {}
	for _, script_code in ipairs(self:getScriptCodes()) do
		table.insert(self._scriptObjects, m_scripts.getByCode(script_code))
	end
	
	return self._scriptObjects
end

-- Returns the script codes from the raw data: the `scripts` field if set, else the fourth positional entry, else a new table containing only "None".
function Language:getScriptCodes()
	local data = self._rawData
	local codes = data.scripts
	if codes then
		return codes
	end
	codes = data[4]
	if codes then
		return codes
	end
	return {"None"}
end

-- Returns the family object for this language, looking it up on first use and caching it in `_familyObject`.
-- The family code is the third positional entry of the raw data, or its `family` field; if neither is set, nothing is cached and the result is nil.
function Language:getFamily()
	if not self._familyObject then
		local family_code = self._rawData[3] or self._rawData.family
		if family_code then
			self._familyObject = require("Module:families").getByCode(family_code)
		end
	end
	
	return self._familyObject
end

-- Returns the list of ancestor objects for this language, building it on first use and caching it in `_ancestorObjects`.
-- If the raw data lists `ancestors`, each code is resolved as a language and, failing that, as an etymology language.
-- Otherwise the proto-language of the nearest family that has one is used, so the list then holds at most one entry.
function Language:getAncestors()
	if self._ancestorObjects then
		return self._ancestorObjects
	end
	self._ancestorObjects = {}
	
	local listed = self._rawData.ancestors
	if listed then
		for _, code in ipairs(listed) do
			table.insert(self._ancestorObjects, export.getByCode(code) or require("Module:etymology languages").getByCode(code))
		end
		return self._ancestorObjects
	end
	
	-- Proto-language of a family, or nil when there is no family or no proto-language.
	local function proto_of(family)
		if not family then
			return nil
		end
		return family:getProtoLanguage() or nil
	end
	
	local family = self:getFamily()
	local proto = proto_of(family)
	
	-- When this language is itself the proto-language of its family,
	-- start the search one level further up.
	if proto and proto:getCode() == self:getCode() then
		family = family:getFamily()
		proto = proto_of(family)
	end
	
	-- Climb the family tree until a proto-language turns up, the families run out, or "qfa-not" is reached.
	while not proto and family and family:getCode() ~= "qfa-not" do
		family = family:getFamily()
		proto = proto_of(family)
	end
	
	-- Inserting nil here leaves the list empty.
	table.insert(self._ancestorObjects, proto)
	
	return self._ancestorObjects
end

-- Walks the ancestors of `node` depth-first, calling `func` on each one before descending into that ancestor's own ancestors.
-- Returns the first truthy value produced by `func`; returns nothing if no call produces one.
local function iterateOverAncestorTree(node, func)
	local ancestors = node:getAncestors()
	for _, ancestor in ipairs(ancestors) do
		if ancestor then
			local found = func(ancestor)
			if not found then
				found = iterateOverAncestorTree(ancestor, func)
			end
			if found then
				return found
			end
		end
	end
end

-- Returns the chain of single ancestors of this language, building it on first use and caching it in `_ancestorChain`.
-- The chain is followed for as long as each step has exactly one ancestor; each new step is inserted at the front, so the most distant ancestor comes first.
function Language:getAncestorChain()
	if self._ancestorChain then
		return self._ancestorChain
	end
	self._ancestorChain = {}
	
	local current = self
	while true do
		local parent
		if #current:getAncestors() == 1 then
			parent = current:getAncestors()[1] or nil
		end
		if not parent then
			break
		end
		table.insert(self._ancestorChain, 1, parent)
		current = parent
	end
	
	return self._ancestorChain
end

-- Returns true if any language in this language's ancestor tree has the same code as `otherlang`, and false otherwise.
function Language:hasAncestor(otherlang)
	local found = iterateOverAncestorTree(self, function(ancestor)
		return ancestor:getCode() == otherlang:getCode()
	end)
	if found then
		return found
	end
	return false
end

-- Returns the category name for this language: the canonical name, with " language" appended unless it already ends in "language" or "Language".
-- Unless `nocap` is set, the first letter is capitalized using the content language's ucfirst.
function Language:getCategoryName(nocap)
	local category = self:getCanonicalName()
	
	local already_suffixed = category:find("[Ll]anguage$")
	if not already_suffixed then
		category = category .. " language"
	end
	
	if nocap then
		return category
	end
	local capitalized = mw.getContentLanguage():ucfirst(category)
	return capitalized
end

-- Builds a wikilink to this language's category page, labelled with the display form.
function Language:makeCategoryLink()
	local category = self:getCategoryName()
	local label = self:getDisplayForm()
	return "[[:Category:" .. category .. "|" .. label .. "]]"
end

-- Returns the `standardChars` field of the raw data (nil when it is not set).
function Language:getStandardCharacters()
	local standard_chars = self._rawData.standardChars
	return standard_chars
end

-- Replaces decimal ("&#N;") and hexadecimal ("&#xN;") numeric character references in `text` with the characters they denote.
-- As the last statement returns a gsub call directly, the number of hexadecimal replacements is returned as a second value.
local function getEntities(text)
	local char = mw.ustring.char
	local function from_hex(digits)
		return char(tonumber(digits, 16))
	end
	
	-- The decimal digits are handed to mw.ustring.char as captured, i.e. as a string.
	local decoded = text:gsub("&#(%d+);", char)
	return decoded:gsub("&#x(%x+);", from_hex)
end

-- Removes unescaped carets from `text`.
-- A caret preceded by a single backslash is kept as a literal caret; a double backslash before a caret becomes a single backslash and that caret is removed.
-- U+E000 and U+E001 serve as temporary stand-ins while the plain carets are stripped.
-- As the last statement returns a gsub call directly, the count from that final replacement is returned as a second value.
local function processCarets(text)
	local char = mw.ustring.char
	local backslash_mark = char(0xE000)
	local caret_mark = char(0xE001)
	
	-- Protect the escapes.
	local result = text:gsub("\\\\^", backslash_mark .. "^")
	result = result:gsub("\\^", caret_mark)
	-- Drop every remaining caret.
	result = result:gsub("%^", "")
	-- Put the protected characters back.
	result = result:gsub(backslash_mark, "\\")
	return result:gsub(caret_mark, "^")
end

-- Rewrites every character of `text` as a decimal numeric character reference ("&#N;") and returns the joined result.
local function escapeUnsupportedTitle(text)
	local codepoints = {mw.ustring.codepoint(text, 1, mw.ustring.len(text))}
	local entities = {}
	for index, value in ipairs(codepoints) do
		entities[index] = "&#" .. value .. ";"
	end
	return table.concat(entities)
end

-- Converts `text` into the corresponding entry (page) name for this language.
-- `sc` is the script object to use; when it is missing or is not a script
-- object, the best script for `text` is detected.
-- The `unsupportedTitle` argument is accepted but is not read by this function.
-- Returns the entry name. If the processed text is listed in the
-- unsupported_titles table of Module:links/data, returns
-- "Unsupported titles/<name>" followed by `true` instead.
function Language:makeEntryName(text, sc, unsupportedTitle)
	if not sc or sc._type ~= "script object" then
		sc = require("Module:scripts").findBestScript(text, self)
	end
	
	-- Drop bold markup and soft hyphens, then expand strip markers. Italic
	-- markup is left alone on purpose: stripping it would let people use it in
	-- place of {{m}} and similar templates.
	text = text:gsub("('*)'''(.-'*)'''", "%1%2")
	text = text:gsub("­", "")
	text = mw.text.unstrip(text)
	
	-- Normalise.
	text = sc:fixDiscouragedSequences(text)
	text = sc:toFixedNFD(text)
	
	-- Unescaped carets only mark capitalisation in transliterations of scripts
	-- without capitalisation, so they are not part of the entry name.
	local stripCarets = not sc:hasCapitalization() and sc:getCode() ~= "None" and text:find("%^")
	if stripCarets then
		text = processCarets(text)
	end
	
	if not text:find(":") then
		text = do_replacements(text, self, sc, self._rawData.entry_name, "makeEntryName")
	else
		-- Colons may introduce interwiki prefixes. "\\:" is an escaped
		-- backslash followed by a real colon; "\:" is an escaped colon, which
		-- is hidden behind a placeholder until processing is finished.
		local u = mw.ustring.char
		local backslashPlaceholder, colonPlaceholder = u(0xE000), u(0xE001)
		text = text:gsub("\\\\:", backslashPlaceholder .. ":")
		text = text:gsub("\\:", colonPlaceholder)
		text = text:gsub(backslashPlaceholder, "\\")
		
		if text:find("^:") then
			-- A leading colon is removed and the entry-name replacements are skipped.
			text = text:gsub("^:", "")
		else
			text = do_replacements(text, self, sc, self._rawData.entry_name, "makeEntryName")
		end
		
		if text:find(colonPlaceholder) then
			text = text:gsub(colonPlaceholder, ":")
		end
	end
	
	local unsupportedPage = mw.loadData("Module:links/data").unsupported_titles[text]
	if unsupportedPage then
		return "Unsupported titles/" .. unsupportedPage, true
	end
	
	-- Strip one leading inverted question/exclamation mark, and trailing
	-- whitespace plus one sentence-final punctuation mark, as long as the rest
	-- contains a character that is neither whitespace nor punctuation.
	-- Otherwise the text is returned unchanged.
	return mw.ustring.match(text, "^[¿¡]?(.-[^%s%p].-)%s*[؟?!;՛՜ ՞ ՟?!︖︕।॥။၊་།]?$") or text
end

-- Generates alternative forms of `text` and returns them as a table.
-- When the language data names a module in `generate_forms`, that module's
-- generateForms function is called with the text, the language code and the
-- script code (the script is detected if `sc` is missing or is not a script
-- object). Otherwise the result is a table holding only the input term.
function Language:generateForms(text, sc)
	local moduleName = self._rawData.generate_forms
	if not moduleName then
		return {text}
	end
	
	if not sc or sc._type ~= "script object" then
		sc = require("Module:scripts").findBestScript(text, self)
	end
	return require("Module:" .. moduleName).generateForms(text, self:getCode(), sc:getCode())
end

-- Reports whether the language has display processing enabled, i.e. whether
-- lang:makeDisplayText() does non-trivial processing. This is the case when
-- the raw data has a `display_text` entry.
function Language:hasDisplayProcessing()
	if self._rawData.display_text then
		return true
	end
	return false
end

-- Apply display-text replacements to `text`, if any.
-- Parameters:
--   text             the text to process.
--   sc               script object; detected from `text` when missing or not a script object.
--   unsupportedTitle if truthy, numeric character references in the input are
--                    not decoded, and every character of the result is
--                    returned as a decimal numeric character reference.
--   keepCarets       if truthy, carets are left in place.
--   keepColons       if truthy, interwiki prefixes are left in place.
-- Returns the display text, with any strip markers restored.
function Language:makeDisplayText(text, sc, unsupportedTitle, keepCarets, keepColons)
	-- Temporarily convert strip markers to PUA characters to prevent them from being disrupted by the substitution process.
	local u, i, stripMarkers = mw.ustring.char, 1, {}
	for stripMarker in text:gmatch(u(0x7F) .. "'\"`UNIQ%-%-%l+%-%x+%-QINU`\"'" .. u(0x7F)) do
		stripMarkers[i] = stripMarker
		-- Hyphens are the only pattern-magic characters a strip marker can contain.
		stripMarker = stripMarker:gsub("-", "%%-")
		text = text:gsub(stripMarker, u(0xFF700+i), 1)
		i = i + 1
	end
	
	text = unsupportedTitle and text or getEntities(text)
	
	if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
	
	text = sc:fixDiscouragedSequences(text)
	text = sc:toFixedNFD(text)
	text = do_replacements(text, self, sc, self._rawData.display_text, "makeDisplayText")
	
	-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped).
	if not keepCarets and not sc:hasCapitalization() and sc:getCode() ~= "None" and text:find("%^") then
		text = processCarets(text)
	end
	-- Remove any interwiki link prefixes (unless they have been escaped or this has been disabled).
	if text:find(":") and not keepColons then
		-- "\\:" is an escaped backslash followed by a real colon; "\:" is an escaped colon.
		text = text
			:gsub("\\\\:", u(0xE000) .. ":")
			:gsub("\\:", u(0xE001))
		-- `prefix` is nil when escaping has hidden every colon.
		local prefix = text:match("^([^:]*):")
		local interwikis = mw.loadData("Module:languages/shareddata").interwikis
		if prefix and interwikis[prefix] and text:find(":") ~= #text then
			-- Remove prefix plus colon. The text is known to start with
			-- `prefix .. ":"`, so it is cut off by position; building a
			-- pattern from the prefix would misbehave for prefixes containing
			-- pattern-magic characters such as "-".
			text = text:sub(#prefix + 2)
			-- If, additionally, there's a language code after the interwiki link, strip that too.
			if text:find(":") ~= #text then text = text:gsub("^[^:]*:(.*)", "%1") end
		end
		text = text
			:gsub(u(0xE000), "\\")
			:gsub(u(0xE001), ":")
	end
	
	text = unsupportedTitle and escapeUnsupportedTitle(text) or text
	
	-- Restore the strip markers, whether the placeholder is still a raw
	-- character or has been turned into a numeric character reference.
	for j = 1, #stripMarkers do
		text = text
			:gsub(u(0xFF700+j), stripMarkers[j])
			:gsub("&#" .. 0xFF700+j .. ";", stripMarkers[j])
	end
	
	return text
end

-- Builds the sort key for `text` in this language.
-- `sc` is the script object to use (detected from `text` when missing or not a
-- script object). When `unsupportedTitle` is truthy, numeric character
-- references in the input are not decoded and every character of the result
-- is returned as a decimal numeric character reference.
function Language:makeSortKey(text, sc, unsupportedTitle)
	if not (unsupportedTitle and text) then
		text = getEntities(text)
	end
	if not sc or sc._type ~= "script object" then
		sc = require("Module:scripts").findBestScript(text, self)
	end
	
	-- Expand strip markers, then drop a leading run of hyphen-like characters
	-- and asterisks, provided at least one character follows it.
	text = mw.text.unstrip(text)
	text = mw.ustring.gsub(text, "^[-־ـ᠊*]+(.)", "%1")
	
	-- Normalise.
	text = sc:fixDiscouragedSequences(text)
	text = sc:toFixedNFD(text)
	
	-- Unescaped carets only mark capitalisation in transliterations of scripts
	-- without capitalisation, so they play no part in sorting.
	local stripCarets = not sc:hasCapitalization() and sc:getCode() ~= "None" and text:find("%^")
	if stripCarets then
		text = processCarets(text)
	end
	
	local dottedDotlessI = self._rawData.dotted_dotless_i
	local sortKeyData = self._rawData.sort_key
	
	-- In languages with dotted and dotless i, "İ" must sort as "i" and "I" as "ı".
	if dottedDotlessI then
		text = text:gsub(mw.ustring.toNFD("İ"), "i")
		text = text:gsub("I", "ı")
		text = sc:toFixedNFD(text)
	end
	
	-- Lowercase, apply the sort-key replacements, then uppercase. If the
	-- language distinguishes dotted and dotless i but defines no sort-key
	-- replacements, "ı" and "i" are first mapped to "I" and "İ" explicitly;
	-- otherwise uppercasing would turn both letters into "I".
	text = mw.ustring.lower(text)
	text = do_replacements(text, self, sc, sortKeyData, "makeSortKey")
	if dottedDotlessI and not sortKeyData then
		text = text:gsub("ı", "I")
		text = text:gsub("i", "İ")
		text = sc:toFixedNFC(text)
	end
	text = mw.ustring.upper(text)
	
	-- Remove parentheses, as long as something precedes or follows them.
	text = text:gsub("(.)[()]+", "%1")
	text = text:gsub("[()]+(.)", "%1")
	
	if unsupportedTitle then
		return escapeUnsupportedTitle(text)
	end
	return text
end

-- Transliterates `text` with the language's transliteration module.
-- Parameters:
--   text             the text to transliterate.
--   sc               script object; detected from `text` when missing or not a script object.
--   module_override  name (without the "Module:" prefix) of a module to use
--                    instead of the language's own `translit_module`; its use is tracked.
-- Returns nil when there is no module to use or `text` is falsy; otherwise
-- returns whatever the module's `tr` function produced, after caret
-- processing where that applies. Exactly one value is returned in every case.
function Language:transliterate(text, sc, module_override)
	if not ((module_override or self._rawData.translit_module) and text) then
		return nil
	end
	
	if module_override then
		require("Module:debug").track("module_override")
	end
	
	-- Get the display form. Carets are kept (fourth argument) because they are handled after transliteration.
	if not sc or sc._type ~= "script object" then sc = require("Module:scripts").findBestScript(text, self) end
	text = mw.text.unstrip(text)
	text = self:makeDisplayText(text, sc, nil, true)
	
	-- Transliterate.
	text = require("Module:" .. (module_override or self._rawData.translit_module)).tr(text, self:getCode(), sc:getCode())
	
	-- If the script does not use capitalization, then capitalize any letters of the transliteration which are immediately preceded by a caret, and then remove it. Otherwise, just return the text.
	if text and not sc:hasCapitalization() and sc:getCode() ~= "None" and text:find("%^") then
		local u = mw.ustring.char
		-- "\\^" is an escaped backslash followed by a real caret; "\^" is an escaped caret.
		text = text
			:gsub("\\\\^", u(0xE000) .. "^")
			:gsub("\\^", u(0xE001))
		text = mw.ustring.gsub(text, "%^(%l)", mw.ustring.upper)
		text = text
			:gsub(u(0xE000), "\\")
			:gsub(u(0xE001), "^")
		-- Return the text alone: returning the gsub chain directly would also
		-- hand the replacement count to the caller as a second value.
		return text
	else
		return text
	end
end

-- Returns true if the raw data sets `override_translit` to a truthy value,
-- and false otherwise.
function Language:overrideManualTranslit()
	return self._rawData.override_translit and true or false
end

-- Returns true if the raw data names a transliteration module
-- (`translit_module`), and false otherwise.
function Language:hasTranslit()
	return not not self._rawData.translit_module
end

-- Returns true if the raw data sets `link_tr` to a truthy value, and false
-- otherwise.
function Language:link_tr()
	if self._rawData.link_tr then
		return true
	end
	return false
end

-- Serialises a summary of the language with Module:JSON and returns the result.
-- The `from`/`to` entry-name replacement lists are paired up into a list of
-- {from = ..., to = ...} tables; a missing `to` item becomes the empty string.
function Language:toJSON()
	local rawData = self._rawData
	local entryName = rawData.entry_name
	local entryNamePatterns, entryNameRemoveDiacritics
	
	if entryName then
		entryNameRemoveDiacritics = entryName.remove_diacritics
		local fromList = entryName.from
		if fromList then
			entryNamePatterns = {}
			for index, from in ipairs(fromList) do
				entryNamePatterns[index] = {from = from, to = entryName.to[index] or ""}
			end
		end
	end
	
	local ret = {
		ancestors = rawData.ancestors,
		canonicalName = self:getCanonicalName(),
		categoryName = self:getCategoryName("nocap"),
		code = self:getCode(),
		entryNamePatterns = entryNamePatterns,
		entryNameRemoveDiacritics = entryNameRemoveDiacritics,
		-- The family and the scripts may be stored positionally or under a named key.
		family = rawData[3] or rawData.family,
		otherNames = self:getOtherNames(true),
		aliases = self:getAliases(),
		varieties = self:getVarieties(),
		scripts = rawData.scripts or rawData[4],
		type = self:getType(),
		wikimediaLanguages = rawData.wikimedia_codes,
		wikidataItem = self:getWikidataItem(),
	}
	
	return require("Module:JSON").toJSON(ret)
end

-- Do NOT use these methods!
-- All uses should be pre-approved on the talk page!

-- Returns the raw data table the language object was created from.
function Language:getRawData()
	local rawData = self._rawData
	return rawData
end

-- Returns the language's extra data table, loading it first if that has not
-- been done yet.
function Language:getRawExtraData()
	self:loadInExtraData()
	local extraData = self._extraData
	return extraData
end

-- Tables whose metatable is Language (see export.makeObject) look up missing
-- keys, i.e. the methods, on Language itself.
Language.__index = Language

-- Returns the name (without the "Module:" prefix) of the data module that
-- holds the language with the given code:
--   two lowercase letters             -> "languages/data2"
--   three lowercase letters           -> "languages/data3/" followed by the first letter
--   lowercase letters and hyphens     -> "languages/datax"
-- Returns nil for any other code.
function export.getDataModuleName(code)
	if code:find("^%l%l$") then
		return "languages/data2"
	end
	if code:find("^%l%l%l$") then
		return "languages/data3/" .. code:sub(1, 1)
	end
	if code:find("^[%l-]+$") then
		return "languages/datax"
	end
	return nil
end


-- Returns the name (without the "Module:" prefix) of the extra-data module
-- that holds the language with the given code:
--   two lowercase letters             -> "languages/extradata2"
--   three lowercase letters           -> "languages/extradata3/" followed by the first letter
--   lowercase letters and hyphens     -> "languages/extradatax"
-- Returns nil for any other code.
function export.getExtraDataModuleName(code)
	if code:find("^%l%l$") then
		return "languages/extradata2"
	end
	if code:find("^%l%l%l$") then
		return "languages/extradata3/" .. code:sub(1, 1)
	end
	if code:find("^[%l-]+$") then
		return "languages/extradatax"
	end
	return nil
end

-- Looks up the raw data for `code` in the data module selected by
-- export.getDataModuleName. Returns nil when no module matches the code or the
-- module has no truthy entry for it.
local function getRawLanguageData(code)
	local modulename = export.getDataModuleName(code)
	if not modulename then
		return nil
	end
	return mw.loadData("Module:" .. modulename)[code] or nil
end

-- Looks up the extra data for `code` in the module selected by
-- export.getExtraDataModuleName. Returns nil when no module matches the code
-- or the module has no truthy entry for it.
local function getRawExtraLanguageData(code)
	local modulename = export.getExtraDataModuleName(code)
	if not modulename then
		return nil
	end
	return mw.loadData("Module:" .. modulename)[code] or nil
end

-- Makes sure self._extraData is populated. On first use the extra data for
-- the language's code is loaded; an empty table is stored when there is none.
function Language:loadInExtraData()
	if self._extraData then
		-- Already loaded.
		return
	end
	self._extraData = getRawExtraLanguageData(self:getCode()) or {}
end

-- Wraps raw language `data` in a language object for `code`.
-- Returns nil when `data` is falsy. Use of data flagged as `deprecated` is
-- reported to the tracking function of Module:debug.
function export.makeObject(code, data)
	if not data then
		return nil
	end
	
	if data.deprecated then
		require("Module:debug").track {
			"languages/deprecated",
			"languages/deprecated/" .. code
		}
	end
	
	local object = {_rawData = data, _code = code, _type = "language object"}
	return setmetatable(object, Language)
end

-- Finds the object for a language code.
-- Parameters:
--   code           the code to look up; anything but a string raises an error.
--   paramForError  if truthy and nothing is found, the `code` function of
--                  Module:languages/errorGetBy is called with it.
--   allowEtymLang  if truthy, etymology languages are tried when no regular language matches.
--   allowFamily    if truthy, families are tried when nothing has matched so far.
-- Returns the object found, or a falsy value when there is none.
function export.getByCode(code, paramForError, allowEtymLang, allowFamily)
	local codeType = type(code)
	if codeType ~= "string" then
		local received = code == nil and "nil" or "a " .. codeType
		error("The function getByCode expects a string as its first argument, but received " .. received .. ".")
	end
	
	local found = export.makeObject(code, getRawLanguageData(code))
	
	-- Fall back to the other kinds of object, in order, where permitted.
	if not found and allowEtymLang then
		found = require("Module:etymology languages").getByCode(code)
	end
	if not found and allowFamily then
		found = require("Module:families").getByCode(code)
	end
	
	if not found and paramForError then
		require("Module:languages/errorGetBy").code(code, paramForError, allowEtymLang, allowFamily)
	end
	
	return found
end

-- Finds the language object for a language name, using the table in
-- "Module:languages/by name". The name is looked up in that table's `all`
-- subtable when it exists and has a truthy entry for the name, and in the
-- table itself otherwise.
-- When the name is unknown, raises an error if `errorIfInvalid` is truthy and
-- returns nil if it is not.
function export.getByName(name, errorIfInvalid)
	local byName = mw.loadData("Module:languages/by name")
	local code = byName.all and byName.all[name] or byName[name]
	
	if code then
		return export.makeObject(code, getRawLanguageData(code))
	end
	
	if errorIfInvalid then
		error("The language name \"" .. name .. "\" is not valid. See [[Wiktionary:List of languages]].")
	end
	return nil
end

-- Finds the object for a canonical name.
-- Parameters:
--   name            the canonical name to look up.
--   errorIfInvalid  if truthy and nothing is found, the `canonicalName`
--                   function of Module:languages/errorGetBy is called.
--   allowEtymLang   if truthy, etymology languages are tried when no regular language matches.
--   allowFamily     if truthy, families are tried when nothing has matched so far.
-- Returns the object found, or a falsy value when there is none.
function export.getByCanonicalName(name, errorIfInvalid, allowEtymLang, allowFamily)
	local canonicalNames = mw.loadData("Module:languages/canonical names")
	local code = canonicalNames and canonicalNames[name]
	
	local found
	if code then
		found = export.makeObject(code, getRawLanguageData(code)) or nil
	end
	
	if not found and allowEtymLang then
		found = require("Module:etymology languages").getByCanonicalName(name)
	end
	if not found and allowFamily then
		-- A trailing " languages" is dropped before the family lookup.
		local famname = name:match("^(.*) languages$") or name
		found = require("Module:families").getByCanonicalName(famname)
	end
	
	if not found and errorIfInvalid then
		require("Module:languages/errorGetBy").canonicalName(name, allowEtymLang, allowFamily)
	end
	
	return found
end

--[[	Climbs from `lang` through its parents for as long as the current
		object is an etymology language, and returns the first object that is
		not one. Each parent code is resolved as a regular language first, then
		as an etymology language, then as a family. ]]
function export.getNonEtymological(lang)
	local current = lang
	while current:getType() == "etymology language" do
		-- With both fallbacks enabled and no error parameter, getByCode tries
		-- regular languages, etymology languages and families in that order.
		current = export.getByCode(current:getParentCode(), nil, true, true)
	end
	
	return current
end

-- Kept for backwards compatibility only; modules should require
-- Module:languages/error themselves. All arguments are passed on unchanged,
-- and whatever that module returns is returned.
function export.err(lang_code, param, code_desc, template_tag, not_real_lang)
	local languageError = require("Module:languages/error")
	return languageError(lang_code, param, code_desc, template_tag, not_real_lang)
end

-- Hand the table of exported functions to whoever loads this module.
return export