Blame SOURCES/wget-1.14-support-non-ASCII-characters.patch

226bdc
From 0a33fa22c597234ab133f63127b4a5e00cf048b9 Mon Sep 17 00:00:00 2001
226bdc
From: Tomas Hozza <thozza@redhat.com>
226bdc
Date: Mon, 20 Jun 2016 12:10:38 +0200
226bdc
Subject: [PATCH] Support non-ASCII characters
226bdc
226bdc
Upstream commit 59b920874daa565a1323ffa1e756e80493190686
226bdc
226bdc
Signed-off-by: Tomas Hozza <thozza@redhat.com>
226bdc
---
226bdc
 src/url.c             | 87 +++++++++++++++++++++++++++++++++++++++++++++++++--
226bdc
 tests/Test-ftp-iri.px |  4 +--
226bdc
 2 files changed, 87 insertions(+), 4 deletions(-)
226bdc
226bdc
diff --git a/src/url.c b/src/url.c
226bdc
index 6bca719..d0d9e27 100644
226bdc
--- a/src/url.c
226bdc
+++ b/src/url.c
226bdc
@@ -42,6 +42,11 @@ as that of the covered work.  */
226bdc
 #include "url.h"
226bdc
 #include "host.h"  /* for is_valid_ipv6_address */
226bdc
 
226bdc
+#if HAVE_ICONV
226bdc
+#include <iconv.h>
226bdc
+#include <langinfo.h>
226bdc
+#endif
226bdc
+
226bdc
 #ifdef __VMS
226bdc
 #include "vms.h"
226bdc
 #endif /* def __VMS */
226bdc
@@ -1335,8 +1340,8 @@ UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
226bdc
   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
226bdc
   0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */
226bdc
 
226bdc
-  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
226bdc
-  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
226bdc
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 128-143 */
226bdc
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 144-159 */
226bdc
   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
226bdc
   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
226bdc
 
226bdc
@@ -1456,6 +1461,82 @@ append_uri_pathel (const char *b, const char *e, bool escaped,
226bdc
   TAIL_INCR (dest, outlen);
226bdc
 }
226bdc
 
226bdc
+/* Convert FNAME from the remote encoding to the local one with iconv.
226bdc
+   On success FNAME is freed and a newly allocated string is returned;
226bdc
+   on failure, or when built without iconv, FNAME itself is returned.  */
226bdc
+static char *
226bdc
+convert_fname (const char *fname)
226bdc
+{
226bdc
+  char *converted_fname = (char *)fname;
226bdc
+#if HAVE_ICONV
226bdc
+  const char *from_encoding = opt.encoding_remote;
226bdc
+  const char *to_encoding = opt.locale;
226bdc
+  const char *orig_fname = fname;
226bdc
+  iconv_t cd;
226bdc
+  size_t done, inlen, outlen;
226bdc
+  char *s;
226bdc
+
226bdc
+  /* Defaults for remote and local encodings.  */
226bdc
+  if (!from_encoding)
226bdc
+    from_encoding = "UTF-8";
226bdc
+  if (!to_encoding)
226bdc
+    to_encoding = nl_langinfo (CODESET);
226bdc
+
226bdc
+  cd = iconv_open (to_encoding, from_encoding);
226bdc
+  if (cd == (iconv_t)(-1))
226bdc
+    {
226bdc
+      /* Nothing was opened, so return without calling iconv_close.  */
226bdc
+      logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
226bdc
+		 quote (from_encoding), quote (to_encoding));
226bdc
+      return converted_fname;
226bdc
+    }
226bdc
+
226bdc
+  inlen = strlen (fname);
226bdc
+  outlen = inlen * 2;
226bdc
+  converted_fname = s = xmalloc (outlen + 1);
226bdc
+  for (;;)
226bdc
+    {
226bdc
+      if (iconv (cd, &fname, &inlen, &s, &outlen) != (size_t)(-1)
226bdc
+	  && iconv (cd, NULL, NULL, &s, &outlen) != (size_t)(-1))
226bdc
+	{
226bdc
+	  /* iconv leaves S just past the last byte it wrote.  */
226bdc
+	  *s = '\0';
226bdc
+	  iconv_close (cd);
226bdc
+	  DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n",
226bdc
+		   orig_fname, from_encoding, converted_fname, to_encoding));
226bdc
+	  xfree (orig_fname);
226bdc
+	  return converted_fname;
226bdc
+	}
226bdc
+
226bdc
+      if (errno != E2BIG)
226bdc
+	{
226bdc
+	  /* Unrecoverable: log the reason and give up on converting.  */
226bdc
+	  if (errno == EINVAL || errno == EILSEQ)
226bdc
+	    logprintf (LOG_VERBOSE,
226bdc
+		       _("Incomplete or invalid multibyte sequence encountered\n"));
226bdc
+	  else
226bdc
+	    logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno);
226bdc
+	  break;
226bdc
+	}
226bdc
+
226bdc
+      /* Output buffer full: keep the DONE bytes written and grow.  The slack
226bdc
+	 guarantees progress when only the final flush ran out of room.  */
226bdc
+      done = s - converted_fname;
226bdc
+      outlen = inlen * 2 + 16;
226bdc
+      converted_fname = xrealloc (converted_fname, done + outlen + 1);
226bdc
+      s = converted_fname + done;
226bdc
+    }
226bdc
+  /* Conversion failed: release our buffer and hand back the original.  */
226bdc
+  DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n",
226bdc
+	   orig_fname, from_encoding, to_encoding));
226bdc
+  xfree (converted_fname);
226bdc
+  converted_fname = (char *)orig_fname;
226bdc
+  iconv_close (cd);
226bdc
+#endif
226bdc
+
226bdc
+  return converted_fname;
226bdc
+}
226bdc
+
226bdc
 /* Append to DEST the directory structure that corresponds the
226bdc
    directory part of URL's path.  For example, if the URL is
226bdc
    http://server/dir1/dir2/file, this appends "/dir1/dir2".
226bdc
@@ -1582,6 +1663,8 @@ url_file_name (const struct url *u, char *replaced_filename)
226bdc
 
226bdc
   fname = fnres.base;
226bdc
 
226bdc
+  fname = convert_fname (fname);
226bdc
+
226bdc
   /* Check the cases in which the unique extensions are not used:
226bdc
      1) Clobbering is turned off (-nc).
226bdc
      2) Retrieval with regetting.
226bdc
diff --git a/tests/Test-ftp-iri.px b/tests/Test-ftp-iri.px
226bdc
index a4b7fe1..24ac467 100755
226bdc
--- a/tests/Test-ftp-iri.px
226bdc
+++ b/tests/Test-ftp-iri.px
226bdc
@@ -26,12 +26,12 @@ my %urls = (
226bdc
     },
226bdc
 );
226bdc
 
226bdc
-my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
226bdc
+my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 --remote-encoding=utf-8 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
226bdc
 
226bdc
 my $expected_error_code = 0;
226bdc
 
226bdc
 my %expected_downloaded_files = (
226bdc
-    "fran${ccedilla_u8}ais.txt" => {
226bdc
+    "fran${ccedilla_l1}ais.txt" => {
226bdc
         content => $francais,
226bdc
     },
226bdc
 );
226bdc
-- 
226bdc
2.5.5
226bdc