liblcf
Loading...
Searching...
No Matches
reader_util.cpp
Go to the documentation of this file.
1/*
2 * This file is part of liblcf. Copyright (c) 2020 liblcf authors.
3 * https://github.com/EasyRPG/liblcf - https://easyrpg.org
4 *
5 * liblcf is Free/Libre Open Source Software, released under the MIT License.
6 * For the full copyright and license information, please view the COPYING
7 * file that was distributed with this source code.
8 */
9
10#include "lcf_options.h"
11#include "scope_guard.h"
12
13#ifdef LCF_SUPPORT_ICU
14# include <unicode/ucsdet.h>
15# include <unicode/ucnv.h>
16# include <unicode/normalizer2.h>
17# include <unicode/unistr.h>
18#else
19# ifdef _MSC_VER
20# error MSVC builds require ICU
21# endif
22#endif
23
24#ifdef _WIN32
25# include <windows.h>
26#else
27# ifndef LCF_SUPPORT_ICU
28# include <iconv.h>
29# endif
30# include <locale>
31#endif
32
33#if defined(__MORPHOS__) || defined(__amigaos4__)
34#define ICONV_CONST const
35#endif
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <stdexcept>
#include <vector>
41
42#include "data.h"
43#include "inireader.h"
44#include "ldb_reader.h"
45#include "reader_util.h"
46
47namespace ReaderUtil {
48}
49
/**
 * Translates a Windows codepage number into the converter name used by the
 * active conversion backend (ICU names when LCF_SUPPORT_ICU is defined,
 * iconv-style names otherwise).
 *
 * @param codepage codepage number; 0 stands for "no encoding".
 * @return converter name, or an empty string when codepage is 0.
 */
std::string ReaderUtil::CodepageToEncoding(int codepage) {
	switch (codepage) {
		case 0:
			return std::string();
		case 932:
#ifdef LCF_SUPPORT_ICU
			return "ibm-943_P15A-2003";
#else
			return "SHIFT_JIS";
#endif
		case 949:
#ifdef LCF_SUPPORT_ICU
			return "windows-949-2000";
#else
			return "cp949";
#endif
		default:
			break;
	}

	// Every remaining codepage is named generically: backend prefix + number
	std::ostringstream name;
#ifdef LCF_SUPPORT_ICU
	name << "windows-";
#else
	name << "CP";
#endif
	name << codepage;

	return name.str();
}
79
80std::string ReaderUtil::DetectEncoding(std::istream& filestream) {
81 std::vector<std::string> encodings = DetectEncodings(filestream);
82
83 if (encodings.empty()) {
84 return "";
85 }
86
87 return encodings.front();
88}
89
90std::string ReaderUtil::DetectEncoding(std::string const & data) {
91 std::vector<std::string> encodings = DetectEncodings(data);
92
93 if (encodings.empty()) {
94 return "";
95 }
96
97 return encodings.front();
98}
99
100std::vector<std::string> ReaderUtil::DetectEncodings(std::istream& filestream) {
101#ifdef LCF_SUPPORT_ICU
102 std::ostringstream text;
103
104 // Populate Data::terms and Data::system; they are empty by default, even if the load fails
105 LDB_Reader::Load(filestream, "");
106
107 text <<
141 Data::terms.no <<
151
152 return ReaderUtil::DetectEncodings(text.str());
153#else
154 return std::vector<std::string>();
155#endif
156}
157
158std::vector<std::string> ReaderUtil::DetectEncodings(std::string const & data) {
159std::vector<std::string> encodings;
160#ifdef LCF_SUPPORT_ICU
161 if (!data.empty()) {
162 UErrorCode status = U_ZERO_ERROR;
163 UCharsetDetector* detector = ucsdet_open(&status);
164
165 std::string s = data;
166 ucsdet_setText(detector, s.c_str(), s.length(), &status);
167
168 int32_t matches_count;
169 const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);
170
171 if (matches != NULL) {
172 // Collect all candidates, most confident comes first
173 for (int i = 0; i < matches_count; ++i) {
174 std::string encoding = ucsdet_getName(matches[i], &status);
175
176 // Fixes to ensure proper Windows encodings
177 if (encoding == "Shift_JIS") {
178 encodings.push_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
179 } else if (encoding == "EUC-KR") {
180 encodings.push_back("windows-949-2000"); // Korean with \ as backlash
181 } else if (encoding == "GB18030") {
182 encodings.push_back("windows-936-2000"); // Simplified Chinese
183 } else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
184 encodings.push_back("ibm-5348_P100-1997"); // Occidental with Euro
185 } else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
186 encodings.push_back("ibm-5346_P100-1998"); // Central Europe with Euro
187 } else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
188 encodings.push_back("ibm-5347_P100-1998"); // Cyrillic with Euro
189 } else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
190 encodings.push_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
191 } else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
192 encodings.push_back("ibm-5349_P100-1998"); // Greek with Euro
193 } else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
194 encodings.push_back("ibm-9447_P100-2002"); // Hebrew with Euro
195 } else {
196 encodings.push_back(encoding);
197 }
198 }
199 }
200 ucsdet_close(detector);
201 }
202#endif
203
204 return encodings;
205}
206
207std::string ReaderUtil::GetEncoding(const std::string& ini_file) {
208 INIReader ini(ini_file);
209 if (ini.ParseError() != -1) {
210 std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
211 if (!encoding.empty()) {
212 return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
213 }
214 }
215 return std::string();
216}
217
218std::string ReaderUtil::GetEncoding(std::istream& filestream) {
219 INIReader ini(filestream);
220 if (ini.ParseError() != -1) {
221 std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
222 if (!encoding.empty()) {
223 return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
224 }
225 }
226 return std::string();
227}
228
230#ifdef _WIN32
231 int codepage = GetACP();
232#elif __ANDROID__
233 // No std::locale support in NDK
234 // Doesn't really matter because the Android version auto-detects via ICU
235 int codepage = 1252;
236#else
237 int codepage = 1252;
238
239 std::locale loc = std::locale("");
240 // Gets the language and culture part only
241 std::string loc_full = loc.name().substr(0, loc.name().find_first_of("@."));
242 // Gets the language part only
243 std::string loc_lang = loc.name().substr(0, loc.name().find_first_of("_"));
244
245 if (loc_lang == "th") codepage = 874;
246 else if (loc_lang == "ja") codepage = 932;
247 else if (loc_full == "zh_CN" ||
248 loc_full == "zh_SG") codepage = 936;
249 else if (loc_lang == "ko") codepage = 949;
250 else if (loc_full == "zh_TW" ||
251 loc_full == "zh_HK") codepage = 950;
252 else if (loc_lang == "cs" ||
253 loc_lang == "hu" ||
254 loc_lang == "pl" ||
255 loc_lang == "ro" ||
256 loc_lang == "hr" ||
257 loc_lang == "sk" ||
258 loc_lang == "sl") codepage = 1250;
259 else if (loc_lang == "ru") codepage = 1251;
260 else if (loc_lang == "ca" ||
261 loc_lang == "da" ||
262 loc_lang == "de" ||
263 loc_lang == "en" ||
264 loc_lang == "es" ||
265 loc_lang == "fi" ||
266 loc_lang == "fr" ||
267 loc_lang == "it" ||
268 loc_lang == "nl" ||
269 loc_lang == "nb" ||
270 loc_lang == "pt" ||
271 loc_lang == "sv" ||
272 loc_lang == "eu") codepage = 1252;
273 else if (loc_lang == "el") codepage = 1253;
274 else if (loc_lang == "tr") codepage = 1254;
275 else if (loc_lang == "he") codepage = 1255;
276 else if (loc_lang == "ar") codepage = 1256;
277 else if (loc_lang == "et" ||
278 loc_lang == "lt" ||
279 loc_lang == "lv") codepage = 1257;
280 else if (loc_lang == "vi") codepage = 1258;
281#endif
282
283 return CodepageToEncoding(codepage);
284}
285
286std::string ReaderUtil::Recode(const std::string& str_to_encode, const std::string& source_encoding) {
287 return ReaderUtil::Recode(str_to_encode, source_encoding, "UTF-8");
288}
289
290std::string ReaderUtil::Recode(const std::string& str_to_encode,
291 const std::string& src_enc,
292 const std::string& dst_enc) {
293
294 if (src_enc.empty() || dst_enc.empty() || str_to_encode.empty()) {
295 return str_to_encode;
296 }
297
298 auto src_cp = atoi(src_enc.c_str());
299 const auto& src_enc_str = src_cp > 0
301 : src_enc;
302
303 auto dst_cp = atoi(dst_enc.c_str());
304 const auto& dst_enc_str = dst_cp > 0
306 : dst_enc;
307
308#ifdef LCF_SUPPORT_ICU
309 auto status = U_ZERO_ERROR;
310 auto conv_from = ucnv_open(src_enc_str.c_str(), &status);
311
312 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
313 fprintf(stderr, "liblcf: ucnv_open() error for source encoding \"%s\": %s\n", src_enc_str.c_str(), u_errorName(status));
314 return std::string();
315 }
316 status = U_ZERO_ERROR;
317 auto conv_from_sg = makeScopeGuard([&]() { ucnv_close(conv_from); });
318
319 auto conv_to = ucnv_open(dst_enc_str.c_str(), &status);
320
321 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
322 fprintf(stderr, "liblcf: ucnv_open() error for dest encoding \"%s\": %s\n", dst_enc_str.c_str(), u_errorName(status));
323 return std::string();
324 }
325 auto conv_to_sg = makeScopeGuard([&]() { ucnv_close(conv_to); });
326 status = U_ZERO_ERROR;
327
328 std::string result(str_to_encode.size() * 4, '\0');
329 auto* src = &str_to_encode.front();
330 auto* dst = &result.front();
331
332 ucnv_convertEx(conv_to, conv_from,
333 &dst, dst + result.size(),
334 &src, src + str_to_encode.size(),
335 nullptr, nullptr, nullptr, nullptr,
336 true, true,
337 &status);
338
339 if (U_FAILURE(status)) {
340 fprintf(stderr, "liblcf: ucnv_convertEx() error when encoding \"%s\": %s\n", str_to_encode.c_str(), u_errorName(status));
341 return std::string();
342 }
343
344 result.resize(dst - result.c_str());
345 result.shrink_to_fit();
346
347 return result;
348#else
349 iconv_t cd = iconv_open(dst_enc_str.c_str(), src_enc_str.c_str());
350 if (cd == (iconv_t)-1)
351 return str_to_encode;
352 char *src = const_cast<char *>(str_to_encode.c_str());
353 size_t src_left = str_to_encode.size();
354 size_t dst_size = str_to_encode.size() * 5 + 10;
355 char *dst = new char[dst_size];
356 size_t dst_left = dst_size;
357# ifdef ICONV_CONST
358 char ICONV_CONST *p = src;
359# else
360 char *p = src;
361# endif
362 char *q = dst;
363 size_t status = iconv(cd, &p, &src_left, &q, &dst_left);
364 iconv_close(cd);
365 if (status == (size_t) -1 || src_left > 0) {
366 delete[] dst;
367 return std::string();
368 }
369 *q++ = '\0';
370 std::string result(dst);
371 delete[] dst;
372 return result;
373#endif
374}
375
/**
 * Produces a lower-cased form of str for case-insensitive comparison.
 *
 * With LCF_SUPPORT_ICU the text is read as UTF-8, lower-cased and then NFKC
 * normalized; if the normalizer is unavailable or fails, the lower-cased
 * text is returned without normalization. Without ICU each byte is passed
 * through tolower.
 *
 * @param str text to normalize.
 * @return the normalized text.
 */
std::string ReaderUtil::Normalize(const std::string &str) {
#ifdef LCF_SUPPORT_ICU
	icu::UnicodeString uni = icu::UnicodeString(str.c_str(), "utf-8").toLower();
	UErrorCode err = U_ZERO_ERROR;
	std::string res;
	const icu::Normalizer2* norm = icu::Normalizer2::getNFKCInstance(err);
	if (U_FAILURE(err)) {
		// Report the missing normalizer only once per process
		static bool err_reported = false;
		if (!err_reported) {
			fprintf(stderr, "Normalizer2::getNFKCInstance failed (%s). \"nrm\" is probably missing in the ICU data file. Unicode normalization will not work!\n", u_errorName(err));
			err_reported = true;
		}
		uni.toUTF8String(res);
		return res;
	}
	icu::UnicodeString f = norm->normalize(uni, err);
	if (U_FAILURE(err)) {
		uni.toUTF8String(res);
	} else {
		f.toUTF8String(res);
	}
	return res;
#else
	std::string result = str;
	// tolower requires a value representable as unsigned char (or EOF);
	// a plain char can be negative for bytes >= 0x80, so convert first.
	std::transform(result.begin(), result.end(), result.begin(),
		[](unsigned char ch) { return static_cast<char>(tolower(ch)); });
	return result;
#endif
}
std::string Get(const std::string &section, const std::string &name, const std::string &default_value) const
Definition: inireader.cpp:103
int ParseError() const
Definition: inireader.cpp:98
std::string airship_name
Definition: rpg_system.h:180
std::string system_name
Definition: rpg_system.h:186
std::string battletest_background
Definition: rpg_system.h:220
std::string title_name
Definition: rpg_system.h:184
std::string boat_name
Definition: rpg_system.h:178
std::string system2_name
Definition: rpg_system.h:187
std::string frame_name
Definition: rpg_system.h:229
std::string gameover_name
Definition: rpg_system.h:185
std::string ship_name
Definition: rpg_system.h:179
std::string status
Definition: rpg_terms.h:122
std::string row
Definition: rpg_terms.h:123
std::string hp_short
Definition: rpg_terms.h:133
std::string shield
Definition: rpg_terms.h:141
std::string lvl_short
Definition: rpg_terms.h:132
std::string agility
Definition: rpg_terms.h:139
std::string health_points
Definition: rpg_terms.h:128
std::string menu_save
Definition: rpg_terms.h:117
std::string attack
Definition: rpg_terms.h:136
std::string spirit
Definition: rpg_terms.h:138
std::string sp_short
Definition: rpg_terms.h:134
std::string exit_game
Definition: rpg_terms.h:121
std::string sp_cost
Definition: rpg_terms.h:135
std::string armor
Definition: rpg_terms.h:142
std::string no
Definition: rpg_terms.h:150
std::string spirit_points
Definition: rpg_terms.h:129
std::string file
Definition: rpg_terms.h:147
std::string new_game
Definition: rpg_terms.h:119
std::string wait_on
Definition: rpg_terms.h:125
std::string level
Definition: rpg_terms.h:127
std::string save_game_message
Definition: rpg_terms.h:145
std::string exit_game_message
Definition: rpg_terms.h:148
std::string load_game
Definition: rpg_terms.h:120
std::string accessory
Definition: rpg_terms.h:144
std::string helmet
Definition: rpg_terms.h:143
std::string defense
Definition: rpg_terms.h:137
std::string yes
Definition: rpg_terms.h:149
std::string exp_short
Definition: rpg_terms.h:131
std::string normal_status
Definition: rpg_terms.h:130
std::string wait_off
Definition: rpg_terms.h:126
std::string load_game_message
Definition: rpg_terms.h:146
std::string menu_quit
Definition: rpg_terms.h:118
std::string order
Definition: rpg_terms.h:124
std::string weapon
Definition: rpg_terms.h:140
RPG::Terms & terms
Definition: data.cpp:30
RPG::System & system
Definition: data.cpp:31
bool Load(const std::string &filename, const std::string &encoding)
Definition: ldb_reader.cpp:24
std::string DetectEncoding(std::istream &filestream)
Definition: reader_util.cpp:80
std::string CodepageToEncoding(int codepage)
Definition: reader_util.cpp:50
std::vector< std::string > DetectEncodings(std::istream &filestream)
std::string GetLocaleEncoding()
std::string Normalize(const std::string &str)
std::string GetEncoding(const std::string &ini_file)
std::string Recode(const std::string &str_to_encode, const std::string &source_encoding)
ScopeGuard< F > makeScopeGuard(F &&f)
Definition: scope_guard.h:39