diff options
-rw-r--r-- | iconvlite.cpp | 189 | ||||
-rw-r--r-- | iconvlite.h | 78 | ||||
-rwxr-xr-x | index.cpp | 71 | ||||
-rw-r--r-- | test.js | 9 | ||||
-rw-r--r-- | vkext_flex.c | 2 | ||||
-rw-r--r-- | vkext_flex.h | 12 |
6 files changed, 175 insertions, 186 deletions
diff --git a/iconvlite.cpp b/iconvlite.cpp index a81ab70..af4bb4a 100644 --- a/iconvlite.cpp +++ b/iconvlite.cpp @@ -3,7 +3,76 @@ using namespace std; -static void cp2utf1(char *out, const char *in) { +typedef struct ConvLetter { + unsigned char win1251; + int unicode; +} Letter; + +static Letter g_letters[] = { + {0x82, 0x201A}, // SINGLE LOW-9 QUOTATION MARK + {0x83, 0x0453}, // CYRILLIC SMALL LETTER GJE + {0x84, 0x201E}, // DOUBLE LOW-9 QUOTATION MARK + {0x85, 0x2026}, // HORIZONTAL ELLIPSIS + {0x86, 0x2020}, // DAGGER + {0x87, 0x2021}, // DOUBLE DAGGER + {0x88, 0x20AC}, // EURO SIGN + {0x89, 0x2030}, // PER MILLE SIGN + {0x8A, 0x0409}, // CYRILLIC CAPITAL LETTER LJE + {0x8B, 0x2039}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK + {0x8C, 0x040A}, // CYRILLIC CAPITAL LETTER NJE + {0x8D, 0x040C}, // CYRILLIC CAPITAL LETTER KJE + {0x8E, 0x040B}, // CYRILLIC CAPITAL LETTER TSHE + {0x8F, 0x040F}, // CYRILLIC CAPITAL LETTER DZHE + {0x90, 0x0452}, // CYRILLIC SMALL LETTER DJE + {0x91, 0x2018}, // LEFT SINGLE QUOTATION MARK + {0x92, 0x2019}, // RIGHT SINGLE QUOTATION MARK + {0x93, 0x201C}, // LEFT DOUBLE QUOTATION MARK + {0x94, 0x201D}, // RIGHT DOUBLE QUOTATION MARK + {0x95, 0x2022}, // BULLET + {0x96, 0x2013}, // EN DASH + {0x97, 0x2014}, // EM DASH + {0x99, 0x2122}, // TRADE MARK SIGN + {0x9A, 0x0459}, // CYRILLIC SMALL LETTER LJE + {0x9B, 0x203A}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + {0x9C, 0x045A}, // CYRILLIC SMALL LETTER NJE + {0x9D, 0x045C}, // CYRILLIC SMALL LETTER KJE + {0x9E, 0x045B}, // CYRILLIC SMALL LETTER TSHE + {0x9F, 0x045F}, // CYRILLIC SMALL LETTER DZHE + {0xA0, 0x00A0}, // NO-BREAK SPACE + {0xA1, 0x040E}, // CYRILLIC CAPITAL LETTER SHORT U + {0xA2, 0x045E}, // CYRILLIC SMALL LETTER SHORT U + {0xA3, 0x0408}, // CYRILLIC CAPITAL LETTER JE + {0xA4, 0x00A4}, // CURRENCY SIGN + {0xA5, 0x0490}, // CYRILLIC CAPITAL LETTER GHE WITH UPTURN + {0xA6, 0x00A6}, // BROKEN BAR + {0xA7, 0x00A7}, // SECTION SIGN + {0xA8, 0x0401}, // CYRILLIC CAPITAL LETTER IO + {0xA9, 0x00A9}, // COPYRIGHT SIGN + {0xAA, 0x0404}, // CYRILLIC CAPITAL LETTER UKRAINIAN IE + {0xAB, 0x00AB}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + {0xAC, 0x00AC}, // NOT SIGN + {0xAD, 0x00AD}, // SOFT HYPHEN + {0xAE, 0x00AE}, // REGISTERED SIGN + {0xAF, 0x0407}, // CYRILLIC CAPITAL LETTER YI + {0xB0, 0x00B0}, // DEGREE SIGN + {0xB1, 0x00B1}, // PLUS-MINUS SIGN + {0xB2, 0x0406}, // CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I + {0xB3, 0x0456}, // CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I + {0xB4, 0x0491}, // CYRILLIC SMALL LETTER GHE WITH UPTURN + {0xB5, 0x00B5}, // MICRO SIGN + {0xB6, 0x00B6}, // PILCROW SIGN + {0xB7, 0x00B7}, // MIDDLE DOT + {0xB8, 0x0451}, // CYRILLIC SMALL LETTER IO + {0xB9, 0x2116}, // NUMERO SIGN + {0xBA, 0x0454}, // CYRILLIC SMALL LETTER UKRAINIAN IE + {0xBB, 0x00BB}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + {0xBC, 0x0458}, // CYRILLIC SMALL LETTER JE + {0xBD, 0x0405}, // CYRILLIC CAPITAL LETTER DZE + {0xBE, 0x0455}, // CYRILLIC SMALL LETTER DZE + {0xBF, 0x0457} // CYRILLIC SMALL LETTER YI +}; + +static void cp1251_to_utf8(char *out, const char *in, size_t len) { static const int table[128] = { 0x82D0,0x83D0,0x9A80E2,0x93D1,0x9E80E2,0xA680E2,0xA080E2,0xA180E2, 0xAC82E2,0xB080E2,0x89D0,0xB980E2,0x8AD0,0x8CD0,0x8BD0,0x8FD0, @@ -22,7 +91,7 @@ static void cp2utf1(char *out, const char *in) { 0x80D1,0x81D1,0x82D1,0x83D1,0x84D1,0x85D1,0x86D1,0x87D1, 0x88D1,0x89D1,0x8AD1,0x8BD1,0x8CD1,0x8DD1,0x8ED1,0x8FD1 }; - while (*in) + while (*in) { if (*in & 0x80) { int v = table[(int)(0x7f & *in++)]; if (!v) @@ -31,75 +100,71 @@ static void cp2utf1(char *out, const char *in) { *out++ = (char)(v >> 8); if (v >>= 16) *out++ = (char)v; - } - else + } else { *out++ = *in++; + } + } *out = 0; } -string cp2utf(string s) { - int c,i; - int len = s.size(); - string ns; - for(i=0; i<len; i++) { - c=s[i]; - char buf[4], in[2] = {0, 0}; - *in = c; - cp2utf1(buf, in); - ns+=string(buf); + +static int utf8_to_cp1251(const char* utf8, char* windows1251, size_t n) { + int i = 0; + int j = 0; + for(; i < (int)n && utf8[i] != 0; ++i) { + char prefix = utf8[i]; + char suffix = utf8[i+1]; + if ((prefix & 0x80) == 0) { + windows1251[j] = (char)prefix; + ++j; + } else if ((~prefix) & 0x20) { + int first5bit = prefix & 0x1F; + first5bit <<= 6; + int sec6bit = suffix & 0x3F; + int unicode_char = first5bit + sec6bit; + + if ( unicode_char >= 0x410 && unicode_char <= 0x44F ) { + windows1251[j] = (char)(unicode_char - 0x350); + } else if (unicode_char >= 0x80 && unicode_char <= 0xFF) { + windows1251[j] = (char)(unicode_char); + } else if (unicode_char >= 0x402 && unicode_char <= 0x403) { + windows1251[j] = (char)(unicode_char - 0x382); + } else { + int count = sizeof(g_letters) / sizeof(Letter); + for (int k = 0; k < count; ++k) { + if (unicode_char == g_letters[k].unicode) { + windows1251[j] = g_letters[k].win1251; + goto NEXT_LETTER; + } + } + // can't convert this char + return 0; + } +NEXT_LETTER: + ++i; + ++j; + } else { + // can't convert this chars + return 0; + } } - return ns; + windows1251[j] = 0; + return 1; } string utf2cp(string s) { size_t len = s.size(); - const char *buff = s.c_str(); - char *output = new char[len]; - convert_utf8_to_windows1251(buff, output, len); + char* output = new char[len+1]; + utf8_to_cp1251(s.c_str(), output, len); string ns(output); + delete[] output; return ns; } -int convert_utf8_to_windows1251(const char* utf8, char* windows1251, size_t n) -{ - int i = 0; - int j = 0; - for(; i < (int)n && utf8[i] != 0; ++i) { - char prefix = utf8[i]; - char suffix = utf8[i+1]; - if ((prefix & 0x80) == 0) { - windows1251[j] = (char)prefix; - ++j; - } else if ((~prefix) & 0x20) { - int first5bit = prefix & 0x1F; - first5bit <<= 6; - int sec6bit = suffix & 0x3F; - int unicode_char = first5bit + sec6bit; - - if ( unicode_char >= 0x410 && unicode_char <= 0x44F ) { - windows1251[j] = (char)(unicode_char - 0x350); - } else if (unicode_char >= 0x80 && unicode_char <= 0xFF) { - windows1251[j] = (char)(unicode_char); - } else if (unicode_char >= 0x402 && unicode_char <= 0x403) { - windows1251[j] = (char)(unicode_char - 0x382); - } else { - int count = sizeof(g_letters) / sizeof(Letter); - for (int k = 0; k < count; ++k) { - if (unicode_char == g_letters[k].unicode) { - windows1251[j] = g_letters[k].win1251; - goto NEXT_LETTER; - } - } - // can't convert this char - return 0; - } -NEXT_LETTER: - ++i; - ++j; - } else { - // can't convert this chars - return 0; - } - } - windows1251[j] = 0; - return 1; -}
\ No newline at end of file +string cp2utf(string s) { + size_t len = s.size(); + char* output = new char[len*3+1]; + cp1251_to_utf8(output, s.c_str(), len); + string ns(output); + delete[] output; + return ns; +} diff --git a/iconvlite.h b/iconvlite.h index c2bb6a3..3194eaf 100644 --- a/iconvlite.h +++ b/iconvlite.h @@ -1,85 +1,9 @@ -/* -iconvlite.h -Iconv Lite -Simple cpp functions to convert strings from cp1251 to utf8 and ftom utf8 to cp1251 -*/ - #ifndef ICONVLITE_H #define ICONVLITE_H using namespace std; string cp2utf(string s); -int convert_utf8_to_windows1251(const char* utf8, char* windows1251, size_t n); string utf2cp(string s); -typedef struct ConvLetter { - unsigned char win1251; - int unicode; -} Letter; - -static Letter g_letters[] = { - {0x82, 0x201A}, // SINGLE LOW-9 QUOTATION MARK - {0x83, 0x0453}, // CYRILLIC SMALL LETTER GJE - {0x84, 0x201E}, // DOUBLE LOW-9 QUOTATION MARK - {0x85, 0x2026}, // HORIZONTAL ELLIPSIS - {0x86, 0x2020}, // DAGGER - {0x87, 0x2021}, // DOUBLE DAGGER - {0x88, 0x20AC}, // EURO SIGN - {0x89, 0x2030}, // PER MILLE SIGN - {0x8A, 0x0409}, // CYRILLIC CAPITAL LETTER LJE - {0x8B, 0x2039}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK - {0x8C, 0x040A}, // CYRILLIC CAPITAL LETTER NJE - {0x8D, 0x040C}, // CYRILLIC CAPITAL LETTER KJE - {0x8E, 0x040B}, // CYRILLIC CAPITAL LETTER TSHE - {0x8F, 0x040F}, // CYRILLIC CAPITAL LETTER DZHE - {0x90, 0x0452}, // CYRILLIC SMALL LETTER DJE - {0x91, 0x2018}, // LEFT SINGLE QUOTATION MARK - {0x92, 0x2019}, // RIGHT SINGLE QUOTATION MARK - {0x93, 0x201C}, // LEFT DOUBLE QUOTATION MARK - {0x94, 0x201D}, // RIGHT DOUBLE QUOTATION MARK - {0x95, 0x2022}, // BULLET - {0x96, 0x2013}, // EN DASH - {0x97, 0x2014}, // EM DASH - {0x99, 0x2122}, // TRADE MARK SIGN - {0x9A, 0x0459}, // CYRILLIC SMALL LETTER LJE - {0x9B, 0x203A}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - {0x9C, 0x045A}, // CYRILLIC SMALL LETTER NJE - {0x9D, 0x045C}, // CYRILLIC SMALL LETTER KJE - {0x9E, 0x045B}, // CYRILLIC SMALL LETTER TSHE - {0x9F, 0x045F}, // CYRILLIC SMALL LETTER DZHE - {0xA0, 0x00A0}, // NO-BREAK SPACE - {0xA1, 0x040E}, // CYRILLIC CAPITAL LETTER SHORT U - {0xA2, 0x045E}, // CYRILLIC SMALL LETTER SHORT U - {0xA3, 0x0408}, // CYRILLIC CAPITAL LETTER JE - {0xA4, 0x00A4}, // CURRENCY SIGN - {0xA5, 0x0490}, // CYRILLIC CAPITAL LETTER GHE WITH UPTURN - {0xA6, 0x00A6}, // BROKEN BAR - {0xA7, 0x00A7}, // SECTION SIGN - {0xA8, 0x0401}, // CYRILLIC CAPITAL LETTER IO - {0xA9, 0x00A9}, // COPYRIGHT SIGN - {0xAA, 0x0404}, // CYRILLIC CAPITAL LETTER UKRAINIAN IE - {0xAB, 0x00AB}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK - {0xAC, 0x00AC}, // NOT SIGN - {0xAD, 0x00AD}, // SOFT HYPHEN - {0xAE, 0x00AE}, // REGISTERED SIGN - {0xAF, 0x0407}, // CYRILLIC CAPITAL LETTER YI - {0xB0, 0x00B0}, // DEGREE SIGN - {0xB1, 0x00B1}, // PLUS-MINUS SIGN - {0xB2, 0x0406}, // CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I - {0xB3, 0x0456}, // CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I - {0xB4, 0x0491}, // CYRILLIC SMALL LETTER GHE WITH UPTURN - {0xB5, 0x00B5}, // MICRO SIGN - {0xB6, 0x00B6}, // PILCROW SIGN - {0xB7, 0x00B7}, // MIDDLE DOT - {0xB8, 0x0451}, // CYRILLIC SMALL LETTER IO - {0xB9, 0x2116}, // NUMERO SIGN - {0xBA, 0x0454}, // CYRILLIC SMALL LETTER UKRAINIAN IE - {0xBB, 0x00BB}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK - {0xBC, 0x0458}, // CYRILLIC SMALL LETTER JE - {0xBD, 0x0405}, // CYRILLIC CAPITAL LETTER DZE - {0xBE, 0x0455}, // CYRILLIC SMALL LETTER DZE - {0xBF, 0x0457} // CYRILLIC SMALL LETTER YI -}; - -#endif
\ No newline at end of file +#endif @@ -2,52 +2,49 @@ #include <v8.h> #include <iostream> +#include <stdlib.h> #include "iconvlite.h" #include "vkext_flex.h" using namespace v8; -using namespace std; - -const char* ToCString(const String::Utf8Value& value) { - return *value ? *value : "<string conversion failed>"; -} void flex(const FunctionCallbackInfo<Value>& args) { - Isolate* isolate = args.GetIsolate(); - - String::Utf8Value nameArg(args[0]); - string nameString(*nameArg); - string nameStringWindows1251 = utf2cp(nameString); - - String::Utf8Value caseArg(args[2]); - string caseString(*caseArg); - string caseStringWindows1251 = utf2cp(caseString); - - String::Utf8Value typeArg(args[3]); - string typeString(*typeArg); - string ctypeStringWindows1251 = utf2cp(typeString); - - int sex = (int)args[1]->NumberValue(); - int lang = (int)args[4]->NumberValue(); - - char *result = do_flex( - nameStringWindows1251.c_str(), - nameStringWindows1251.length(), - caseStringWindows1251.c_str(), - caseStringWindows1251.length(), - sex, - ctypeStringWindows1251.c_str(), - ctypeStringWindows1251.length(), - lang); - - string resultStringWindows1251(result); - string resultString = cp2utf(resultStringWindows1251); - - args.GetReturnValue().Set(String::NewFromUtf8(isolate, resultString.c_str())); + Isolate* isolate = args.GetIsolate(); + + String::Utf8Value nameArg(args[0]); + std::string nameString(*nameArg); + std::string nameStringWindows1251 = utf2cp(nameString); + + String::Utf8Value caseArg(args[2]); + std::string caseString(*caseArg); + std::string caseStringWindows1251 = utf2cp(caseString); + + String::Utf8Value typeArg(args[3]); + std::string typeString(*typeArg); + std::string ctypeStringWindows1251 = utf2cp(typeString); + + int sex = (int)args[1]->NumberValue(); + int lang = (int)args[4]->NumberValue(); + + char *result = do_flex( + nameStringWindows1251.c_str(), + nameStringWindows1251.length(), + caseStringWindows1251.c_str(), + caseStringWindows1251.length(), + sex, + ctypeStringWindows1251.c_str(), + lang); + + std::string resultStringWindows1251(result); + free(result); + + std::string resultString = cp2utf(resultStringWindows1251); + + args.GetReturnValue().Set(String::NewFromUtf8(isolate, resultString.c_str())); } void Init(Handle<Object> exports) { NODE_SET_METHOD(exports, "flex", flex); } -NODE_MODULE(hello, Init); +NODE_MODULE(vkext_flex, Init) @@ -3,12 +3,14 @@ const vkflex = require('./index.js') let nameCases = ['Gen', 'Dat', 'Acc', 'Ins', 'Abl'] let names = [ ['Евгений', 'Зиновьев', 0], - ['Павел', 'Дуров', 0], - ['Анастасия', 'Семенюк', 1], - ['Катя', 'Лебедева', 1] + //['Павел', 'Дуров', 0], + //['Анастасия', 'Семенюк', 1], + //['Катя', 'Лебедева', 1] + //['Denis', 'Komissarov', 0] ] console.time('flex') +for (let i = 0; i < 100; i++) { for (let [name, surname, sex] of names) { console.log('Testing "'+name+' '+surname+'"...') @@ -18,4 +20,5 @@ for (let [name, surname, sex] of names) { console.log('') } +} console.timeEnd('flex') diff --git a/vkext_flex.c b/vkext_flex.c index 244ac37..b78de49 100644 --- a/vkext_flex.c +++ b/vkext_flex.c @@ -38,7 +38,7 @@ char *estrdup (const char *s) { return d; } -char *do_flex (const char *name, int name_len, const char *case_name, int case_name_len, int sex, const char *type, int type_len, int lang_id) { +char *do_flex (const char *name, int name_len, const char *case_name, int case_name_len, int sex, const char *type, int lang_id) { if (name_len > (1 << 10)) { return estrdup (name); } diff --git a/vkext_flex.h b/vkext_flex.h index 48de3c2..0799748 100644 --- a/vkext_flex.h +++ b/vkext_flex.h @@ -14,7 +14,7 @@ You should have received a copy of the GNU General Public License along with VK/KittenPHP-DB-Engine. If not, see <http://www.gnu.org/licenses/>. - This program is released under the GPL with the additional exemption + This program is released under the GPL with the additional exemption that compiling, linking, and/or using OpenSSL is allowed. You are free to remove this exemption from derived works. @@ -30,10 +30,10 @@ #if defined __cplusplus extern "C" { #endif - + #include <stdio.h> - + struct vk_node { short tail_len; short hyphen; @@ -52,9 +52,9 @@ struct lang { const char **endings; struct vk_node nodes[]; }; - -char *do_flex (const char *name, int name_len, const char *case_name, int case_name_len, int sex, const char *type, int type_len, int lang_id); - + +char *do_flex (const char *name, int name_len, const char *case_name, int case_name_len, int sex, const char *type, int lang_id); + #if defined __cplusplus }; #endif |