1
0

y_punycode.inc 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597
  // Include guard: if this file has already been included once, stop reading
  // it here; otherwise mark it as included and carry on.
  #if defined _INC_y_punycode
      #endinput
  #endif
  #define _INC_y_punycode
  5. /**
  6. * <library name="y_punycode">
  7. * <section>
  8. * Description
  9. * </section>
  10. * Functions for converting unicode strings to and from punycode, to be
  11. * represented in just ASCII characters. Based on several public
  12. * implementations and the RFC, adapted for PAWN. For more information see:
  13. *
  14. * <a href="https://en.wikipedia.org/wiki/Punycode" />
  15. *
  16. * Also includes a function that hooks the "HTTP" function to allow for
  17. * internationalised domain names with that function.
  18. * <section>
  19. * Version
  20. * </section>
  21. * 0.1
  22. * <section>
  23. * Functions
  24. * </section>
  25. * <subsection>
  26. * Stock
  27. * </subsection><ul>
  28. * <symbol name="Puny_Encode">Convert a Unicode string to Punycode.</symbol>
  29. * <symbol name="Puny_Decode">Convert a Punycode string to Unicode.</symbol>
  30. * <symbol name="Puny_HTTP">Wrapper for "HTTP" to encode domain names.</symbol>
  31. * </ul>
  32. * </library>
  33. *//** *//*
  34. Legal:
  35. Version: MPL 1.1
  36. The contents of this file are subject to the Mozilla Public License Version
  37. 1.1 the "License"; you may not use this file except in compliance with
  38. the License. You may obtain a copy of the License at
  39. http://www.mozilla.org/MPL/
  40. Software distributed under the License is distributed on an "AS IS" basis,
  41. WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  42. for the specific language governing rights and limitations under the
  43. License.
  44. The Original Code is the YSI framework.
  45. The Initial Developer of the Original Code is Alex "Y_Less" Cole.
  46. Portions created by the Initial Developer are Copyright C 2011
  47. the Initial Developer. All Rights Reserved.
  48. Contributors:
  49. Y_Less
  50. koolk
  51. JoeBullet/Google63
  52. g_aSlice/Slice
  53. Misiur
  54. samphunter
  55. tianmeta
  56. maddinat0r
  57. spacemud
  58. Crayder
  59. Dayvison
  60. Ahmad45123
  61. Zeex
  62. irinel1996
  63. Yiin-
  64. Chaprnks
  65. Konstantinos
  66. Masterchen09
  67. Southclaws
  68. PatchwerkQWER
  69. m0k1
  70. paulommu
  71. udan111
  72. Thanks:
  73. JoeBullet/Google63 - Handy arbitrary ASM jump code using SCTRL.
  74. ZeeX - Very productive conversations.
  75. koolk - IsPlayerinAreaEx code.
  76. TheAlpha - Danish translation.
  77. breadfish - German translation.
  78. Fireburn - Dutch translation.
  79. yom - French translation.
  80. 50p - Polish translation.
  81. Zamaroht - Spanish translation.
  82. Los - Portuguese translation.
  83. Dracoblue, sintax, mabako, Xtreme, other coders - Producing other modes for
  84. me to strive to better.
  85. Pixels^ - Running XScripters where the idea was born.
  86. Matite - Pestering me to release it and using it.
  87. Very special thanks to:
  88. Thiadmer - PAWN, whose limits continue to amaze me!
  89. Kye/Kalcor - SA:MP.
  90. SA:MP Team past, present and future - SA:MP.
  91. Optional plugins:
  92. Gamer_Z - GPS.
  93. Incognito - Streamer.
  94. Me - sscanf2, fixes2, Whirlpool.
  95. */
  96. #include "..\YSI_Internal\y_compilerdata"
  97. #include "..\YSI_Internal\y_version"
  98. #include "..\YSI_Internal\y_pp"
  99. #include "..\YSI_Core\y_debug"
  100. #if !defined HTTP
  101. #include <a_http>
  102. #endif
  // "string:" is only an annotation on parameters - it expands to nothing.
  #define string:

  // Number of distinct punycode digits ('a' - 'z' and '0' - '9').
  #define PUNY_BASE (36)

  // Default character placed between the basic and the encoded code points.
  #define PUNY_CHAR ('-')

  // Marker for "not a punycode digit" in the decoder table.  Some versions use
  // "-1" or "cellmax", the RFC uses "PUNY_BASE".
  #define PUNY_INVL PUNY_BASE

  static stock const
      // Lowest and highest digit thresholds.
      PUNY_TMIN = 1,
      PUNY_TMAX = 26,
      // Constants used when adapting the bias after each code point.
      PUNY_SKEW = 38,
      // Starting bias.
      PUNY_BIAS = 72,
      // Starting value of "n" (the first non-basic code point).
      PUNY_INIT = 128,
      // Divisor applied to the very first delta when adapting the bias.
      PUNY_DAMP = 700,
      // Maps an ASCII character (0 - 127) to its digit value, or to
      // "PUNY_INVL" when the character is not a digit.
      YSI_gscDecoder[128] =
          {
              // 0 - 47: nothing valid before '0'.
              PP_LOOP<48>(PUNY_INVL)(,),
              // '0' - '9' are digits 26 - 35.
              26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
              // 58 - 64: between '9' and 'A'.
              PP_LOOP<7>(PUNY_INVL)(,),
              // 'A' - 'Z' are digits 0 - 25.
              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
              13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
              // 91 - 96: between 'Z' and 'a'.
              PP_LOOP<6>(PUNY_INVL)(,),
              // 'a' - 'z' are also digits 0 - 25 (case insensitive).
              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
              13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
              // 123 - 127: nothing valid after 'z'.
              PP_LOOP<5>(PUNY_INVL)(,)
          };
  /*-------------------------------------------------------------------------*//**
   * <param name="dst">Where to store the converted string.</param>
   * <param name="src">The punycode string to convert.</param>
   * <param name="wlen">The length of the destination.</param>
   * <param name="delimiter">What character is between the parts.</param>
   * <returns>
   *  0 when the encoded part of "src" is malformed (a character that is not a
   *  punycode digit, a number cut short by the end of the input, or a value too
   *  large for a cell), otherwise 1.  When 0 is returned "dst" may hold a
   *  partially decoded string.
   * </returns>
   * <remarks>
   *  Takes a punycode string and converts it to unicode.  Everything before the
   *  LAST delimiter is copied as-is, everything after it is decoded and
   *  inserted in to that copy.  If there is no delimiter, or the basic part
   *  does not fit in "dst", the input is just copied (truncated to "wlen") and 1
   *  is returned.
   * </remarks>
   *//*------------------------------------------------------------------------**/

  stock Puny_Decode(string:dst[], string:src[], wlen = sizeof (dst), const delimiter = PUNY_CHAR)
  {
      new
          rlen = strlen(src),
          basicEnd = rlen;
      // Scan backwards for the last delimiter.  If none is found the loop runs
      // out with "basicEnd" at -1.
      while (basicEnd--)
      {
          if (src[basicEnd] == delimiter) break;
      }
      // Enough space for the string, and not empty.
      if (0 < ++basicEnd < wlen)
      {
          // Enough space to store the basic string.  The length limit includes
          // the terminator, so only the characters before the delimiter are
          // copied.
          dst[0] = '\0',
          strcat(dst, src, basicEnd);
      }
      else
      {
          // No delimiter (so no punycode), or not enough room.
          return dst[0] = '\0', strcat(dst, src, wlen), 1;
      }
      --wlen;
      for (
          new
              n = PUNY_INIT,
              bias = PUNY_BIAS,
              delta = 0,
              codePointsWritten = basicEnd - 1,
              pointsRead = basicEnd;
          pointsRead != rlen && codePointsWritten != wlen;
          )
      {
          new
              oldDelta = delta;
          // Decode one variable-length number in to "delta".
          for (new w = 1, k = PUNY_BASE; ; k += PUNY_BASE)
          {
              // The input ended in the middle of a number - malformed.
              if (pointsRead == rlen) return 0;
              new
                  ch = src[pointsRead++];
              // Anything outside the table can't be a digit, and must not be
              // used as an index.
              if (!(0 <= ch < sizeof (YSI_gscDecoder))) return 0;
              new
                  digit = YSI_gscDecoder[ch];
              // Not a digit, or "delta" would overflow.
              if (digit == PUNY_INVL || digit > (cellmax - delta) / w) return 0;
              delta += digit * w;
              new
                  t = (k <= bias) ? (PUNY_TMIN) : ((k >= bias + PUNY_TMAX) ? (PUNY_TMAX) : (k - bias));
              // Find the end of the current code.
              if (digit < t) break;
              // The weight would overflow.
              if (w > cellmax / (PUNY_BASE - t)) return 0;
              w *= PUNY_BASE - t;
          }
          bias = Puny_Adapt(delta - oldDelta, ++codePointsWritten, oldDelta == 0);
          // The code point would overflow.
          if (delta / codePointsWritten > cellmax - n) return 0;
          // A one character string (the second cell is always the terminator)
          // used to insert the new code point.
          static
              sTinyString[2];
          // "delta" now splits in to how far "n" advances and the position in
          // the output at which the new code point is inserted.
          n += delta / codePointsWritten,
          delta %= codePointsWritten,
          sTinyString[0] = n,
          strins(dst, sTinyString, delta++, wlen + 1);
      }
      return 1;
  }
  /*-------------------------------------------------------------------------*//**
   * <param name="dst">Where to store the converted string.</param>
   * <param name="src">The string to convert.</param>
   * <param name="wlen">The length of the destination.</param>
   * <param name="delimiter">What character to place between the parts.</param>
   * <returns>
   *  -1 when "src" contained only basic (ASCII) characters and was copied
   *  unchanged, with no delimiter added.  1 when something was encoded.  0 on
   *  failure - "dst" is too small for the basic characters and delimiter, or
   *  a value overflowed.  "dst" is always terminated.
   * </returns>
   * <remarks>
   *  Takes a unicode string and converts it to punycode.  The basic characters
   *  are written first, then the delimiter (even when there are no basic
   *  characters), then the encoded positions of all other characters.
   * </remarks>
   *//*------------------------------------------------------------------------**/

  stock Puny_Encode(string:dst[], string:src[], wlen = sizeof (dst), const delimiter = PUNY_CHAR)
  {
      new
          widx,
          rlen = strlen(src);
      // Leave room for the terminator.
      --wlen;
      for (new ridx = 0; ridx != rlen; ++ridx)
      {
          // Every code point below "PUNY_INIT" (128) is basic, including DEL
          // (0x7F).  The encoding loop below only handles code points from
          // "PUNY_INIT" up, so anything basic MUST be copied here.
          if ('\0' < src[ridx] < PUNY_INIT)
          {
              if (widx == wlen) return (dst[widx] = '\0');
              dst[widx++] = src[ridx];
          }
      }
      // Wrote out all the characters.
      if (widx == rlen) return (dst[widx] = '\0'), -1;
      if (widx < wlen) dst[widx++] = delimiter;
      else return (dst[widx] = '\0');
      // Set up punycode variables.  "codePointsWritten" excludes the delimiter,
      // "basicPointsWritten" includes it so that the first call to "Puny_Adapt"
      // (made after the increment) can be detected.
      for (
          new
              n = PUNY_INIT,
              bias = PUNY_BIAS,
              delta = 0,
              codePointsWritten = widx - 1,
              basicPointsWritten = widx;
          codePointsWritten < rlen;
          )
      {
          new
              m = cellmax;
          for (new ridx = 0; ridx != rlen; ++ridx)
          {
              if (n <= src[ridx] < m)
              {
                  // Find the lowest Unicode character.
                  m = src[ridx];
              }
          }
          // Make sure the number isn't too big to encode.
          if ((m - n) > (cellmax - delta) / (codePointsWritten + 1)) return (dst[widx] = '\0');
          // More punycode state machine.
          delta += (m - n) * (codePointsWritten + 1),
          n = m;
          for (new ridx = 0; ridx != rlen; ++ridx)
          {
              if (src[ridx] < n)
              {
                  if (++delta == 0) return (dst[widx] = '\0');
              }
              else if (src[ridx] == n)
              {
                  // Write out the distance to this occurrence, then adapt.
                  widx += Puny_EncodeVar(bias, delta, dst[widx], wlen - widx),
                  ++codePointsWritten,
                  bias = Puny_Adapt(delta, codePointsWritten, (codePointsWritten == basicPointsWritten)),
                  delta = 0;
              }
          }
          ++n,
          ++delta;
      }
      return (dst[widx] = '\0'), 1;
  }
  /*-------------------------------------------------------------------------*//**
   * <param name="dst">Where to store the converted string.</param>
   * <param name="src">The string to convert.</param>
   * <param name="hash">Store the hash value.</param>
   * <param name="wlen">The length of the destination.</param>
   * <param name="delimiter">What character to place between the parts.</param>
   * <returns>
   *  The length of string read.
   * </returns>
   * <remarks>
   *  Takes a unicode string and converts it to punycode, while at the same time
   *  generating a Bernstein hash of the string.  CASE INSENSITIVE.  Reading
   *  stops at the first character that is a space or lower (including the
   *  terminator).  If every character read was basic (ASCII) they are copied in
   *  lower case with no delimiter.  A packed "src" is unpacked to encode it and
   *  the result is packed again, except in that all-basic case.  If "dst" is
   *  too small the output is cut short.
   * </remarks>
   *//*------------------------------------------------------------------------**/

  stock Puny_EncodeHash(string:dst[], string:src[], &hash, wlen = sizeof (dst), const delimiter = PUNY_CHAR)
  {
      new
          ch,
          widx,
          rlen,
          sSrc[YSI_MAX_STRING],
          bPacked = ispacked(src);
      // Leave room for the terminator.
      --wlen,
      hash = -1;
      // Work on an unpacked copy, so that every character is one cell.
      if (bPacked) {
          strunpack(sSrc, src);
      } else {
          strcpy(sSrc, src);
      }
      // "bb" stays true while only basic characters have been seen.
      for (new bool:bb = true; ; ++rlen)
      {
          // Every code point below "PUNY_INIT" (128) is basic, including DEL
          // (0x7F).  The encoding loop below only handles code points from
          // "PUNY_INIT" up, so anything basic MUST be copied here.
          if ((ch = sSrc[rlen]) < PUNY_INIT)
          {
              if (ch <= ' ')
              {
                  if (bb)
                  {
                      // Nothing to encode.
                      // NOTE(review): unlike every other exit this one does
                      // not re-pack "dst" for packed input - confirm intent.
                      return
                          dst[widx] = '\0',
                          rlen;
                  }
                  break;
              }
              // No room left for this character - stop instead of writing
              // past the end of "dst".
              if (widx == wlen) return (dst[widx] = '\0'), (bPacked ? strpack(dst, dst, wlen) : 0), rlen;
              ch = tolower(ch),
              dst[widx++] = ch,
              hash = hash * 33 + ch;
          }
          else bb = false;
      }
      // Wrote out all the characters, but there's no room for the delimiter.
      // Terminate BEFORE packing, so that "strpack" sees the end of the string.
      if (widx >= wlen) return (dst[widx] = '\0'), (bPacked ? strpack(dst, dst, wlen) : 0), rlen;
      dst[widx++] = delimiter,
      hash = hash * 33 + delimiter;
      // Set up punycode variables.  "codePointsWritten" excludes the delimiter,
      // "basicPointsWritten" includes it so that the first call to "Puny_Adapt"
      // (made after the increment) can be detected.
      for (
          new
              n = PUNY_INIT,
              bias = PUNY_BIAS,
              delta = 0,
              codePointsWritten = widx - 1,
              basicPointsWritten = widx;
          codePointsWritten < rlen;
          )
      {
          new
              m = cellmax;
          for (new ridx = 0; ridx != rlen; ++ridx)
          {
              ch = tolower(sSrc[ridx]);
              if (n <= ch < m)
              {
                  // Find the lowest Unicode character.
                  m = ch;
              }
          }
          // Make sure the number isn't too big to encode.
          if ((m - n) > (cellmax - delta) / (codePointsWritten + 1)) return (dst[widx] = '\0'), (bPacked ? strpack(dst, dst, wlen) : 0), rlen;
          // More punycode state machine.
          delta += (m - n) * (codePointsWritten + 1),
          n = m;
          for (new ridx = 0; ridx != rlen; ++ridx)
          {
              ch = tolower(sSrc[ridx]);
              if (ch < n)
              {
                  if (++delta == 0) return (dst[widx] = '\0'), (bPacked ? strpack(dst, dst, wlen) : 0), rlen;
              }
              else if (ch == n)
              {
                  // Write out the distance to this occurrence, then adapt.
                  widx += Puny_EncodeVarHash(bias, delta, dst[widx], wlen - widx, hash),
                  ++codePointsWritten,
                  bias = Puny_Adapt(delta, codePointsWritten, (codePointsWritten == basicPointsWritten)),
                  delta = 0;
              }
          }
          ++n,
          ++delta;
      }
      return (dst[widx] = '\0'), (bPacked ? strpack(dst, dst, wlen) : 0), rlen;
  }
  /*-------------------------------------------------------------------------*//**
   * <param name="num">The single number to encode.</param>
   * <remarks>
   *  Convert a single digit to base 36.  Digits 0 - 25 become 'a' - 'z' and
   *  digits 26 - 35 become '0' - '9', which is the exact inverse of the
   *  "YSI_gscDecoder" table ('0' decodes to 26).  The parameter is evaluated
   *  more than once, so only pass simple expressions.
   * </remarks>
   *//*------------------------------------------------------------------------**/

  P:D(_Puny_Basic(num));
  #define _Puny_Basic(%0) (((%0) > 25) ? ((%0) + ('0' - 26)) : ((%0) + 'a'))
  /*-------------------------------------------------------------------------*//**
   * <param name="bias">Part of the state machine.</param>
   * <param name="delta">Part of the state machine (the number to write).</param>
   * <param name="dst">Array to write to.</param>
   * <param name="wlen">Size of the array.</param>
   * <param name="hash">Hashed string.</param>
   * <returns>
   *  The number of characters written to "dst".
   * </returns>
   * <remarks>
   *  Writes "delta" as a series of base 36 digits, least significant first.
   *  Each position has a threshold derived from "bias"; a digit below its
   *  threshold marks the end of the number.  Every character written is also
   *  folded in to "hash".  Writing stops early, without a terminating digit,
   *  if "wlen" characters have been written.
   * </remarks>
   *//*------------------------------------------------------------------------**/

  static stock Puny_EncodeVarHash(bias, delta, dst[], wlen, &hash)
  {
      new
          written = 0,
          threshold;
      for (new k = PUNY_BASE; written < wlen; k += PUNY_BASE)
      {
          // Clamp the threshold for this position to TMIN - TMAX.
          threshold = (k <= bias) ? (PUNY_TMIN) : ((k >= bias + PUNY_TMAX) ? (PUNY_TMAX) : (k - bias));
          // What's left fits in the final digit.
          if (delta < threshold) break;
          new
              span = PUNY_BASE - threshold,
              digit = threshold + (delta - threshold) % span;
          dst[written] = _Puny_Basic(digit);
          hash = hash * 33 + dst[written];
          ++written;
          delta = (delta - threshold) / span;
      }
      // The final digit, if there is still room for it.
      if (written < wlen)
      {
          dst[written] = _Puny_Basic(delta);
          hash = hash * 33 + dst[written];
          ++written;
      }
      return written;
  }
  /*-------------------------------------------------------------------------*//**
   * <param name="bias">Part of the state machine.</param>
   * <param name="delta">Part of the state machine (the number to write).</param>
   * <param name="dst">Array to write to.</param>
   * <param name="wlen">Size of the array.</param>
   * <returns>
   *  The number of characters written to "dst".
   * </returns>
   * <remarks>
   *  Writes "delta" as a series of base 36 digits, least significant first.
   *  Each position has a threshold derived from "bias"; a digit below its
   *  threshold marks the end of the number.  Writing stops early, without a
   *  terminating digit, if "wlen" characters have been written.
   * </remarks>
   *//*------------------------------------------------------------------------**/

  static stock Puny_EncodeVar(bias, delta, dst[], wlen)
  {
      new
          written = 0,
          threshold;
      for (new k = PUNY_BASE; written < wlen; k += PUNY_BASE)
      {
          // Clamp the threshold for this position to TMIN - TMAX.
          threshold = (k <= bias) ? (PUNY_TMIN) : ((k >= bias + PUNY_TMAX) ? (PUNY_TMAX) : (k - bias));
          // What's left fits in the final digit.
          if (delta < threshold) break;
          new
              span = PUNY_BASE - threshold,
              digit = threshold + (delta - threshold) % span;
          dst[written++] = _Puny_Basic(digit);
          delta = (delta - threshold) / span;
      }
      // The final digit, if there is still room for it.
      if (written < wlen) dst[written++] = _Puny_Basic(delta);
      return written;
  }
  /*-------------------------------------------------------------------------*//**
   * <param name="delta">Part of the state machine.</param>
   * <param name="length">Written string size.</param>
   * <param name="firstTime">Is this the first adaptation?</param>
   * <returns>
   *  The new bias.
   * </returns>
   * <remarks>
   *  Calculates the bias to use for the next number from the delta just
   *  processed.  The first delta is divided by "PUNY_DAMP", later ones are
   *  halved.  "length" must not be 0.
   * </remarks>
   *//*------------------------------------------------------------------------**/

  static stock Puny_Adapt(delta, length, bool:firstTime)
  {
      // Scale the delta down.
      delta = firstTime ? (delta / PUNY_DAMP) : (delta >>> 1);
      // Allow for the string being longer next time.
      delta += delta / length;
      new
          k;
      // Repeatedly divide until the delta is within range, counting a whole
      // base for every division.
      for (k = 0; delta > (((PUNY_BASE - PUNY_TMIN) * PUNY_TMAX) >> 1); k += PUNY_BASE)
      {
          delta /= (PUNY_BASE - PUNY_TMIN);
      }
      return k + ((PUNY_BASE - PUNY_TMIN + 1) * delta) / (delta + PUNY_SKEW);
  }
  /*-------------------------------------------------------------------------*//**
   * <param name="index">The HTTP reference index.</param>
   * <param name="type">How the request should be sent.</param>
   * <param name="url">The (internationalised) URL address.</param>
   * <param name="data">The GET/POST data.</param>
   * <param name="callback">Which function to return the data to.</param>
   * <returns>
   *  0 if any part of the domain could not be encoded (no request is made),
   *  otherwise whatever the original "HTTP" returns.
   * </returns>
   * <remarks>
   *  Hooks the "HTTP" function.  Only compiled when "PUNY_HTTP_HOOK" is
   *  defined.  Each dot-separated part of the domain is encoded separately and
   *  given the "xn--" prefix when it contains non-ASCII characters; anything
   *  after the first '/' following the domain is passed through untouched.
   *  "url" is modified temporarily while encoding, so must be writable, but is
   *  always restored before returning.
   * </remarks>
   *//*------------------------------------------------------------------------**/

  #if defined PUNY_HTTP_HOOK
  stock Puny_HTTP(index, type, url[], data[], callback[])
  {
      static
          sPart[64], // Maximum legal domain part length.
          sEncoded[256]; // Maximum legal hostname length.
      new
          idx = strfind(url, !"://");
      // Skip any prefix.
      if (idx != -1) idx += 2;
      // Add the protocol (nothing is copied when there isn't one).
      sEncoded[0] = '\0',
      strcat(sEncoded, url, idx + 2);
      // Encode all parts.
      new
          prev = idx + 1,
          end = strfind(url, !"/", false, prev);
      if (end == -1) end = strlen(url); // Nothing after the main domain.
      do
      {
          // Find the size of one part.
          idx = strfind(url, !".", false, prev);
          // Only encode the domain part.
          if (!(-1 < idx < end)) idx = end;
          static
              ch;
          // There's no length parameter for "Puny_Encode", so we need a limit.
          // Temporarily end the string after this part, remembering what was
          // there.
          ch = url[idx],
          url[idx] = sPart[0] = '\0';
          switch (Puny_Encode(sPart, url[prev]))
          {
              // Encoding error.
              case 0:
              {
                  // Restore the data - don't hand back a truncated URL.
                  url[idx] = ch;
                  return 0;
              }
              // Encoded something, add the prefix.
              case 1:
              {
                  // The hyphen at the start is the only one - no latin chars.
                  // It doubles as the second hyphen of the "xn--" prefix.
                  if (sPart[0] == '-' && strfind(sPart, !"-", false, 1) == -1) format(sEncoded, sizeof (sEncoded), "%sxn-%s%c", sEncoded, sPart, ch);
                  else format(sEncoded, sizeof (sEncoded), "%sxn--%s%c", sEncoded, sPart, ch);
                  #if defined _DEBUG
                      #if _DEBUG >= 1
                          // Check that the encoding can be reversed.
                          static
                              sDecoded[64];
                          Puny_Decode(sDecoded, sPart);
                          P:5("Puny_HTTP Original: \"%s\", Encoded: \"%s\", Decoded: \"%s\"", url[prev], sPart, sDecoded);
                          if (strcmp(url[prev], sDecoded)) P:E("Puny_Decode did not match Puny_Encode");
                      #endif
                  #endif
              }
              // No special characters.
              case -1: format(sEncoded, sizeof (sEncoded), "%s%s%c", sEncoded, sPart, ch);
          }
          // Restore the data.
          url[idx] = ch,
          prev = idx + 1;
      }
      while (idx < end);
      // Add the remainder of the URL (the '/' itself was added above).
      if (url[end]) strcat(sEncoded, url[end + 1]);
      #if defined _DEBUG
          P:2("Puny_HTTP Domain: \"%s\" -> \"%s\"", url, sEncoded);
      #endif
      // Call the original "HTTP" (this is before the redefinition below).
      return HTTP(index, type, sEncoded, data, callback);
  }

  #if defined _ALS_HTTP
      #undef HTTP
  #else
      native BAD_HTTP(index, type, url[], data[], callback[]) = HTTP;
      #define _ALS_HTTP
  #endif
  #define HTTP Puny_HTTP
  #endif
  // Clean up the macros that are only for use within this file.  "PUNY_INVL"
  // expands to "PUNY_BASE", so must not outlive it.
  #undef _Puny_Basic
  #undef PUNY_INVL
  #undef PUNY_BASE
  #undef PUNY_CHAR

  // Only include the tests when they have been requested.
  #if defined YSI_TESTS
      #include "..\YSI_Core\y_testing"
      #include "y_punycode/tests"
  #endif