]> pilppa.com Git - libcharencoding.git/commitdiff
libcharencoding: initial version
authorMika Laitio <lamikr@iiris.(none)>
Sun, 7 Feb 2010 21:52:44 +0000 (23:52 +0200)
committerMika Laitio <lamikr@iiris.(none)>
Sun, 7 Feb 2010 21:52:44 +0000 (23:52 +0200)
20 files changed:
AUTHORS [new file with mode: 0644]
COPYING [new file with mode: 0644]
ChangeLog [new file with mode: 0644]
INSTALL [new file with mode: 0644]
Makefile.am [new file with mode: 0644]
NEWS [new file with mode: 0644]
README [new file with mode: 0644]
autobuild.sh [new file with mode: 0755]
autoclean.sh [new file with mode: 0755]
configure.ac [new file with mode: 0644]
libcharencoding.pc.in [new file with mode: 0644]
src/Makefile.am [new file with mode: 0644]
src/charencoding.c [new file with mode: 0644]
src/charencoding.h [new file with mode: 0644]
src/internal/charencoding_internal.c [new file with mode: 0644]
src/internal/charencoding_internal.h [new file with mode: 0644]
src/internal/utf8.c [new file with mode: 0644]
src/internal/utf8.h [new file with mode: 0644]
src_test/Makefile.am [new file with mode: 0644]
src_test/libcharencoding_test.c [new file with mode: 0644]

diff --git a/AUTHORS b/AUTHORS
new file mode 100644 (file)
index 0000000..6b94837
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1 @@
+Mika Laitio <lamikr@pilppa.org>
\ No newline at end of file
diff --git a/COPYING b/COPYING
new file mode 100644 (file)
index 0000000..b2f02dd
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,165 @@
+               GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/ChangeLog b/ChangeLog
new file mode 100644 (file)
index 0000000..9c5e960
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1 @@
+Feb 7, 2010: Released first version
\ No newline at end of file
diff --git a/INSTALL b/INSTALL
new file mode 100644 (file)
index 0000000..196d452
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,18 @@
+                       libcharencoding install
+                        =======================
+
+
+This library uses the GNU autotools (autoconf, automake and libtool) to configure the build.
+
+You can configure the library for your system and create the makefiles by using the command:
+
+       autobuild.sh
+       
+Once the Makefiles have been generated, you can use either:
+
+       make or 
+       autobuild.sh
+
+To clean up all generated temporary files, you can use the command:
+
+       autoclean.sh
\ No newline at end of file
diff --git a/Makefile.am b/Makefile.am
new file mode 100644 (file)
index 0000000..4773b40
--- /dev/null
@@ -0,0 +1,5 @@
+# Directories automake descends into, in this order: the library, then the
+# test program.
+SUBDIRS = src \
+       src_test
+# Install the generated pkg-config file into $(libdir)/pkgconfig.
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libcharencoding.pc
+# Ship the pkg-config template in distribution tarballs.
+EXTRA_DIST = libcharencoding.pc.in
diff --git a/NEWS b/NEWS
new file mode 100644 (file)
index 0000000..c7ab92a
--- /dev/null
+++ b/NEWS
@@ -0,0 +1 @@
+See ChangeLog
\ No newline at end of file
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..550601f
--- /dev/null
+++ b/README
@@ -0,0 +1,5 @@
+This library provides methods for converting C strings between different encodings.
+The library requires the iconv functions, which are nowadays available in glibc.
+
+API:
+- see charencoding.h and libcharencoding_test.c
\ No newline at end of file
diff --git a/autobuild.sh b/autobuild.sh
new file mode 100755 (executable)
index 0000000..97c69a9
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+#
+# Bootstraps the autotools build system when no Makefile exists yet and then
+# builds the project with make.
+
+# Abort on the first failing step so that configure and make never run on
+# top of a half-generated build system.
+set -e
+
+if ! [ -e Makefile ] ;
+then
+       echo "generating build system files for target platform"
+       libtoolize --automake --force --copy
+       aclocal
+       autoheader
+       touch stamp-h
+       autoconf
+       automake -a -c
+       ./configure --prefix=/usr/local
+else
+       echo "no need to generate makefiles"
+fi
+make
diff --git a/autoclean.sh b/autoclean.sh
new file mode 100755 (executable)
index 0000000..21cc35a
--- /dev/null
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+# call make maintainers clean first
+if [ -e Makefile ] ;
+then
+       make maintainer-clean
+fi
+
+# then clean all other files that can be generated by autobuild.sh
+rm -f aclocal.m4
+rm -f compile
+rm -f config.h
+rm -f config.h.in
+rm -f config.log
+rm -f config.status
+rm -f config.guess
+rm -f config.sub
+rm -f configure
+rm -f depcomp
+rm -f install-sh
+rm -f libtool
+rm -f ltmain.sh
+rm -f Makefile.in
+rm -f missing
+rm -f stamp-h
+rm -f stamp-h1
+
+rm -rf autom4te.cache
+
+rm -f src/Makefile.in
+rm -rf src/.libs
+rm -rf src/.deps
+rm -rf src/*.gcov
+rm -rf src/*.gcno
+rm -rf src/*.gcda
+
+rm -f src_test/Makefile.in
+rm -rf src_test/.libs
+rm -rf src_test/.deps
+rm -rf src_test/*.gcov
+rm -rf src_test/*.gcno
+rm -rf src_test/*.gcda
diff --git a/configure.ac b/configure.ac
new file mode 100644 (file)
index 0000000..01c1c9e
--- /dev/null
@@ -0,0 +1,33 @@
+# Autoconf input for libcharencoding.
+# AC_INIT is given a source file of the package; config.h is the generated
+# configuration header.
+AC_INIT(src/charencoding.c)
+AM_CONFIG_HEADER(config.h)
+
+# Package name and version handed to AM_INIT_AUTOMAKE below.
+PACKAGE=libcharencoding
+VERSION=0.1.0
+
+# Debug information is always enabled and every warning is treated as an
+# error (-Wall -Werror) on top of the CFLAGS supplied by the user.
+CFLAGS="$CFLAGS -g -Wall -Werror -ggdb"
+LDFLAGS="$LDFLAGS"
+AC_SUBST(CFLAGS)
+AC_SUBST(LDFLAGS)
+AC_MSG_NOTICE([libcharencoding Makefile])
+
+AM_INIT_AUTOMAKE($PACKAGE, $VERSION)
+
+# Tool checks: C compiler, Objective-C compiler, standard headers, libtool
+# and install.
+# NOTE(review): no Objective-C source is visible in this commit, so
+# AC_PROG_OBJC looks unnecessary - confirm before removing.
+AC_PROG_CC
+AC_PROG_OBJC
+AC_STDC_HEADERS
+AC_PROG_LIBTOOL
+AC_PROG_INSTALL
+AM_PROG_CC_C_O
+
+PKG_PROG_PKG_CONFIG()
+
+# Placeholder for external pkg-config dependencies; currently disabled.
+#PKG_CHECK_MODULES(SRC, )
+#AC_SUBST(SRC_CFLAGS)
+#AC_SUBST(SRC_LIBS)
+
+# Files generated by configure from their .in templates.
+AC_OUTPUT([
+libcharencoding.pc
+Makefile
+src/Makefile
+src_test/Makefile
+])
diff --git a/libcharencoding.pc.in b/libcharencoding.pc.in
new file mode 100644 (file)
index 0000000..3360a37
--- /dev/null
@@ -0,0 +1,11 @@
+# pkg-config template for libcharencoding; configure substitutes the
+# @...@ placeholders.
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libcharencoding
+Description: character set detection and conversion library
+Version: @VERSION@
+Requires: 
+Cflags: -I${includedir} -I${includedir}/libcharencoding
+# -lcharencoding is needed so that `pkg-config --libs libcharencoding`
+# really links the library and not only adds the search path.
+Libs: -L${libdir} -lcharencoding
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644 (file)
index 0000000..95a92f9
--- /dev/null
@@ -0,0 +1,11 @@
+# Shared library built from the public API file and the internal helpers.
+lib_LTLIBRARIES = libcharencoding.la
+# Every continuation backslash must be the last character on its line;
+# a trailing blank after it ends the list early.
+libcharencoding_la_SOURCES = \
+       charencoding.c charencoding.h \
+       internal/charencoding_internal.c internal/charencoding_internal.h \
+       internal/utf8.c internal/utf8.h
+libcharencoding_la_LDFLAGS = $(DEPS_LIBS) $(all_libraries) -version-info 1:0:0 -no-undefined
+AM_CPPFLAGS = $(DEPS_CFLAGS)
+
+# Only the public header is installed, below $(includedir)/libcharencoding.
+libcharencodingincludedir=$(includedir)/libcharencoding
+libcharencodinginclude_HEADERS =   \
+        charencoding.h
diff --git a/src/charencoding.c b/src/charencoding.c
new file mode 100644 (file)
index 0000000..6ef76e9
--- /dev/null
@@ -0,0 +1,128 @@
+/**
+ * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
+ *
+ * This file and the library are covered by the LGPL version 3; see COPYING for details.
+ *
+ * History:
+ * - Created charencoding.c on: Nov 25, 2009
+ */
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <iconv.h>
+
+#include "internal/charencoding_internal.h"
+#include "internal/utf8.h"
+
+/* Codeset name used for the UTF-8 locale check and as conversion target. */
+#define LOCALE_CHARSET_UTF8 "UTF-8"
+
+/* Becomes true once is_utf8_locale() has evaluated the locale codeset. */
+static bool _locale_charset_utf8_check_initialized     = false;
+/* Cached answer of is_utf8_locale(); valid once the flag above is true. */
+static bool _locale_is_utf8                            = false;
+
+/**
+ * Tells whether the codeset of the current locale is "UTF-8".
+ *
+ * The codeset is looked up and compared only on the first call; later calls
+ * return the cached answer.
+ */
+bool is_utf8_locale()
+{
+       if (!_locale_charset_utf8_check_initialized) {
+               const char *codeset = get_locale_charset_alias();
+
+               _locale_is_utf8 = (strcmp(codeset, LOCALE_CHARSET_UTF8) == 0);
+               _locale_charset_utf8_check_initialized = true;
+       }
+       return _locale_is_utf8;
+}
+
+/**
+ * Converts a 0-terminated string from one character set to another.
+ *
+ * src           string to convert; NULL yields NULL without an error
+ * src_charset   iconv name of the encoding of src
+ * trgt_charset  iconv name of the wanted encoding
+ * err_flg       out: 0 on success, otherwise a non-zero error value
+ *               (EINVAL for a missing charset name, ENOMEM when out of
+ *               memory, the errno of iconv_open(), or the value reported by
+ *               the conversion itself)
+ *
+ * Returns a newly allocated string the caller must free(). When the
+ * conversion fails NULL is returned. If only iconv_close() fails, the
+ * converted string is still returned and *err_flg holds the close result.
+ */
+char *str_encode(const char *src,
+               const char *src_charset,
+               const char *trgt_charset,
+               int *err_flg)
+{
+       iconv_t conversion_desc;
+       char    *ret_val;
+       size_t  trgt_length;
+       int     close_res;
+
+       *err_flg        = 0;
+       if (src == NULL) {
+               return NULL;
+       }
+       if ((src_charset == NULL) ||
+           (trgt_charset == NULL)) {
+               // strcasecmp() and iconv_open() must not see NULL names
+               *err_flg        = EINVAL;
+               return NULL;
+       }
+       if ((strcasecmp(src_charset, trgt_charset) == 0) ||
+           (src[0] == '\0')) {
+               // nothing to convert, a plain copy is enough
+               ret_val = strdup(src);
+               if (ret_val == NULL) {
+                       *err_flg        = ENOMEM;
+               }
+               return ret_val;
+       }
+       conversion_desc = iconv_open(trgt_charset,
+                               src_charset);
+       if (conversion_desc == (iconv_t)-1) {
+               // report the real reason, e.g. EINVAL for an unsupported conversion
+               *err_flg        = (errno != 0) ? errno : EINVAL;
+               return NULL;
+       }
+       trgt_length     = 0;
+       ret_val         = str_iconv_encode(conversion_desc,
+                                       src,
+                                       strlen(src),
+                                       &trgt_length,
+                                       err_flg);
+       // the descriptor is released on the success and on the error path
+       close_res       = iconv_close(conversion_desc);
+       if (*err_flg != 0) {
+               free(ret_val);
+               ret_val = NULL;
+       }
+       else {
+               *err_flg        = close_res;
+       }
+       return ret_val;
+}
+
+/**
+ * Returns a newly allocated UTF-8 version of a string that is encoded in
+ * the codeset of the current locale.
+ *
+ * src      string in the locale's encoding; NULL yields NULL
+ * err_flg  out: always written; 0 on success, ENOMEM when the copy could
+ *          not be allocated, otherwise the error reported by str_encode()
+ *
+ * The caller must free() the returned string.
+ */
+char *utf8_encode(char *src,
+               int *err_flg)
+{
+       char    *ret_val;
+
+       // give callers a defined value on every path
+       *err_flg        = 0;
+       if (src == NULL) {
+               return NULL;
+       }
+       if (is_utf8_locale() == true) {
+               // already UTF-8, a copy is enough
+               ret_val = strdup(src);
+               if (ret_val == NULL) {
+                       *err_flg        = ENOMEM;
+               }
+       }
+       else {
+               ret_val = str_encode(src,
+                               get_locale_charset_alias(),
+                               LOCALE_CHARSET_UTF8,
+                               err_flg);
+       }
+       return ret_val;
+}
+
+/**
+ * printf-style front end of utf8_vprintf(): collects the variadic
+ * arguments, forwards them together with fmt and returns the result of
+ * utf8_vprintf().
+ */
+int utf8_printf(char *fmt, ...)
+{
+       va_list arg_list;
+       int     printed;
+
+       va_start(arg_list, fmt);
+       printed = utf8_vprintf(fmt, arg_list);
+       va_end(arg_list);
+       return printed;
+}
diff --git a/src/charencoding.h b/src/charencoding.h
new file mode 100644 (file)
index 0000000..39ee1c9
--- /dev/null
@@ -0,0 +1,24 @@
+/**
+ * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
+ *
+ * This file and the library are covered by the LGPL version 3; see COPYING for details.
+ *
+ * History:
+ * - Created charencoding.h on: Nov 25, 2009
+ */
+
+#ifndef CHARENCODING_H_
+#define CHARENCODING_H_
+
+#include <stdio.h>
+#include <stdbool.h>
+
+/**
+ * Tells whether the codeset of the current locale is "UTF-8". The check is
+ * done on the first call and the answer is cached.
+ */
+bool is_utf8_locale();
+/**
+ * Converts src from src_codeset to trgt_codeset with iconv.
+ * The returned string is newly allocated and must be released with free().
+ * NULL is returned when src is NULL and on allocation or conversion errors.
+ * *err_flg is reset to 0 on entry and is non-zero after a failure.
+ */
+char *str_encode(const char *src,
+               const char *src_codeset,
+               const char *trgt_codeset,
+               int *err_flg);
+/**
+ * Returns a newly allocated UTF-8 copy of input_str, or NULL when input_str
+ * is NULL. When the locale codeset is not UTF-8 the string is converted
+ * with str_encode(), which receives err_flg.
+ */
+char *utf8_encode(char *input_str, int *err_flg);
+/**
+ * printf-style wrapper that forwards fmt and the variadic arguments to the
+ * internal utf8_vprintf() and returns its result.
+ */
+int utf8_printf(char *fmt, ...);
+
+#endif /* CHARENCODING_H_ */
diff --git a/src/internal/charencoding_internal.c b/src/internal/charencoding_internal.c
new file mode 100644 (file)
index 0000000..f56f67b
--- /dev/null
@@ -0,0 +1,141 @@
+/**
+  * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
+  *
+  * This file and the library are covered by the LGPL version 3; see COPYING for details.
+  *
+  * History:
+  * - Created charencoding_internal.c on: Nov 25, 2009
+  */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <locale.h>
+#include <langinfo.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <iconv.h>
+#include <limits.h>
+#include <iconv.h>
+#include <time.h>
+#include <ctype.h>
+
+/* Size in bytes of the scratch buffer used while counting converted bytes. */
+#define SIZE_CALCULATION_TEMP_BUFFER_SIZE      8
+
+/* Becomes true after get_locale_charset_alias() has called setlocale(). */
+static bool _locale_charset_initialized                        = false;
+
+/**
+ * Returns the codeset name of the current locale as reported by
+ * nl_langinfo(CODESET). The first call runs setlocale(LC_ALL, "") before
+ * the lookup.
+ */
+const char *get_locale_charset_alias()
+{
+       if (!_locale_charset_initialized) {
+               setlocale(LC_ALL, "");
+               _locale_charset_initialized = true;
+       }
+
+       return nl_langinfo(CODESET);
+}
+
+/**
+ * Counts how many bytes the conversion of src will produce.
+ *
+ * The input is converted piece by piece into a small scratch buffer whose
+ * content is thrown away; only the number of produced bytes is added up.
+ *
+ * conversion_desc  open iconv descriptor; it is reset to its initial state
+ * src              bytes to convert
+ * bcount_src       number of bytes in src
+ *
+ * Returns the number of bytes needed for the converted text (without a
+ * terminator), or (size_t)-1 when the input can not be converted.
+ */
+size_t calculate_target_charset_bytecount(iconv_t conversion_desc,
+                                       const char *src,
+                                       size_t bcount_src)
+{
+       size_t          ret_val;
+       size_t          ic_ret_val;
+       size_t          bcount_trgt;
+       size_t          bcount_src_before;
+       const char      *p_src;
+       char            *p_trgt;
+       char            tmp_buf[SIZE_CALCULATION_TEMP_BUFFER_SIZE];
+
+       ret_val         = 0;
+       // reset conversion descriptor to initial state
+       iconv(conversion_desc,
+               NULL,
+               NULL,
+               NULL,
+               NULL);
+       p_src   = src;
+       while (bcount_src > 0) {
+               p_trgt                  = tmp_buf;
+               bcount_trgt             = SIZE_CALCULATION_TEMP_BUFFER_SIZE;
+               bcount_src_before       = bcount_src;
+               ic_ret_val              = iconv(conversion_desc,
+                                               (char **)&p_src,
+                                               &bcount_src,
+                                               &p_trgt,
+                                               &bcount_trgt);
+               if (ic_ret_val == (size_t)(-1)) {
+                       if (errno != E2BIG) {
+                               // invalid or incomplete input
+                               return (size_t)(-1);
+                       }
+                       // E2BIG only means that tmp_buf is full, which is expected.
+                       // Without any progress the loop would never end, though.
+                       if ((bcount_src == bcount_src_before) &&
+                           (bcount_trgt == SIZE_CALCULATION_TEMP_BUFFER_SIZE)) {
+                               return (size_t)(-1);
+                       }
+               }
+               ret_val = ret_val + SIZE_CALCULATION_TEMP_BUFFER_SIZE - bcount_trgt;
+       }
+       return ret_val;
+}
+
+/**
+ * Converts bcount_src bytes of src with an already opened iconv descriptor.
+ *
+ * conversion_desc  open iconv descriptor; it stays open
+ * src              bytes to convert
+ * bcount_src       number of bytes in src
+ * p_bcount         out: size of the converted text in bytes, or (size_t)-1
+ *                  on failure
+ * err_flg          out: 0 on success, -1 on failure
+ *
+ * Returns a newly allocated, 0-terminated buffer the caller must free(),
+ * or NULL on failure.
+ */
+char *str_iconv_encode(iconv_t conversion_desc,
+               const char *src,
+               size_t bcount_src,
+               size_t *p_bcount,
+               int *err_flg)
+{
+       char            *ret_val;
+       char            *p_ret_val;
+       const char      *p_src;
+       size_t          bcount_src_left;
+       size_t          bcount_trgt_left;
+       size_t          ic_ret_val;
+
+       *err_flg        = 0;
+       *p_bcount       = calculate_target_charset_bytecount(conversion_desc,
+                                               src,
+                                               bcount_src);
+       // size_t is unsigned, so the failure value has to be compared
+       // explicitly; a ">= 0" test is always true
+       if (*p_bcount == (size_t)(-1)) {
+               // could not calculate the byte count
+               *err_flg        = -1;
+               return NULL;
+       }
+       // one extra zeroed byte terminates the converted text
+       ret_val = calloc(1, *p_bcount + 1);
+       if (ret_val == NULL) {
+               *err_flg        = -1;
+               *p_bcount       = (size_t)(-1);
+               return NULL;
+       }
+       // reset conversion descriptor to initial state
+       iconv(conversion_desc,
+               NULL,
+               NULL,
+               NULL,
+               NULL);
+       p_src                   = src;
+       bcount_src_left         = bcount_src;
+       p_ret_val               = ret_val;
+       bcount_trgt_left        = *p_bcount;
+       while (bcount_src_left > 0) {
+               ic_ret_val      = iconv(conversion_desc,
+                                       (char **)&p_src,
+                                       &bcount_src_left,
+                                       &p_ret_val,
+                                       &bcount_trgt_left);
+               if (ic_ret_val == (size_t)(-1)) {
+                       free(ret_val);
+                       ret_val         = NULL;
+                       *err_flg        = -1;
+                       *p_bcount       = (size_t)(-1);
+                       break;
+               }
+       }
+       return ret_val;
+}
diff --git a/src/internal/charencoding_internal.h b/src/internal/charencoding_internal.h
new file mode 100644 (file)
index 0000000..c26a355
--- /dev/null
@@ -0,0 +1,20 @@
+/**
+  * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
+  *
+  * This file and the library are covered by the LGPL version 3; see COPYING for details.
+  *
+  * History:
+  * - Created charencoding_internal.h on: Nov 25, 2009
+  */
+
+#ifndef CHARENCODING_INTERNAL_H_
+#define CHARENCODING_INTERNAL_H_
+
+/*
+ * NOTE(review): this header uses iconv_t and size_t but includes nothing
+ * itself, so <iconv.h> has to be included before it.
+ */
+
+/**
+ * Returns nl_langinfo(CODESET); the first call also runs
+ * setlocale(LC_ALL, "").
+ */
+const char *get_locale_charset_alias();
+/**
+ * Converts src_bcount bytes of src through conversion_desc into a newly
+ * allocated, 0-terminated buffer that the caller must free().
+ * *p_bcount receives the size of the converted text in bytes.
+ * On failure NULL is returned, *err_flg is set to -1 and *p_bcount to
+ * (size_t)-1; on success *err_flg is 0.
+ */
+char *str_iconv_encode(iconv_t conversion_desc,
+               const char *src,
+               size_t src_bcount,
+               size_t *p_bcount,
+               int *err_flg);
+
+#endif /* CHARENCODING_INTERNAL_H_ */
diff --git a/src/internal/utf8.c b/src/internal/utf8.c
new file mode 100644 (file)
index 0000000..7525dc8
--- /dev/null
@@ -0,0 +1,115 @@
+/**
+  * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
+  *
+  * This file and the library are covered by the LGPL version 3; see COPYING for details.
+  *
+  * History:
+  * - utf8.c Nov 25, 2009: small modifications of the original files to fit libcharencoding
+  * - Based on the basic UTF-8 manipulation routines
+  *   by Jeff Bezanson,
+  *   placed in the public domain, Fall 2005
+  *
+  * This code is designed to provide the utilities you need to manipulate
+  * UTF-8 as an internal string encoding. These functions do not perform the
+  * error checking normally needed when handling UTF-8 data, so if you happen
+  * to be from the Unicode Consortium you will want to flay me alive.
+  * I do this because error checking can be performed at the boundaries (I/O),
+  * with these routines reserved for higher performance on data known to be
+  * valid.
+  */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include <malloc.h>
+#include <stdbool.h>
+
+/*
+ * Correction values indexed by the number of trailing bytes of a sequence.
+ * utf8_toucs() subtracts the matching entry from the value it accumulated
+ * from the raw bytes of the sequence.
+ */
+static const u_int32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL,
+               0x82082080UL };
+
+/*
+ * Number of bytes that follow the first byte of a UTF-8 sequence, indexed
+ * by the value of that first byte (0 for 0x00..0xBF, up to 5 for 0xFC..0xFF).
+ */
+static const char trailingBytesForUTF8[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
+               2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
+
+/**
+ * Decodes UTF-8 bytes into 32-bit code points.
+ *
+ * Continuation bytes are not validated. Only sequences of one to four
+ * bytes are decoded; a first byte that announces a 5- or 6-byte sequence
+ * is replaced by U+FFFD and skipped. A sequence that is cut off by the end
+ * of the input ends the conversion.
+ *
+ * dest   receives the code points; 0-terminated whenever sz > 0, even if
+ *        there is not enough room for all the characters
+ * sz     size of dest in code points, terminator included; sz = srcsz + 1
+ *        is always enough
+ * src    UTF-8 input
+ * srcsz  source size in bytes, or -1 if src is 0-terminated
+ *
+ * Returns the number of code points stored, not counting the terminator.
+ */
+int utf8_toucs(u_int32_t *dest, int sz, char *src, int srcsz)
+{
+       const bool      zero_terminated = (srcsz == -1);
+       /* src - 1 must never be formed, so only positive sizes move the end */
+       const char      *src_end        = (srcsz > 0) ? src + srcsz : src;
+       u_int32_t       ch;
+       int             nb;
+       int             k;
+       int             i = 0;
+
+       if (sz <= 0) {
+               /* no room for the terminator either */
+               return 0;
+       }
+       while (i < sz - 1) {
+               /* check for the end of the input before *src is read */
+               if (zero_terminated) {
+                       if (*src == 0)
+                               break;
+               }
+               else if (src >= src_end) {
+                       break;
+               }
+               nb = trailingBytesForUTF8[(unsigned char) *src];
+               if (nb > 3) {
+                       /* 0xF8..0xFF never starts a valid sequence: consume
+                          the byte so that the loop keeps advancing */
+                       src++;
+                       dest[i++] = 0xFFFD;
+                       continue;
+               }
+               if (zero_terminated) {
+                       /* a terminator inside the sequence means it is cut off */
+                       for (k = 1; (k <= nb) && (src[k] != 0); k++) {
+                       }
+                       if (k <= nb)
+                               break;
+               }
+               else if (src + nb >= src_end) {
+                       break;
+               }
+               ch = 0;
+               switch (nb) {
+               /* these fall through deliberately */
+               case 3:
+                       ch += (unsigned char) *src++;
+                       ch <<= 6;
+                       /* fallthrough */
+               case 2:
+                       ch += (unsigned char) *src++;
+                       ch <<= 6;
+                       /* fallthrough */
+               case 1:
+                       ch += (unsigned char) *src++;
+                       ch <<= 6;
+                       /* fallthrough */
+               case 0:
+                       ch += (unsigned char) *src++;
+               }
+               ch -= offsetsFromUTF8[nb];
+               dest[i++] = ch;
+       }
+       dest[i] = 0;
+       return i;
+}
+
+/**
+ * Formats fmt/ap like vsnprintf(), decodes the UTF-8 result into wide
+ * characters and prints them to stdout with "%ls".
+ *
+ * fmt  printf-style format string
+ * ap   arguments for fmt; ap itself is not consumed because every
+ *      formatting pass works on its own va_copy()
+ *
+ * Returns the number of characters converted, the negative vsnprintf()
+ * result when formatting fails, or -1 when memory runs out.
+ */
+int utf8_vprintf(char *fmt, va_list ap)
+{
+       char            stack_buf[512];
+       char            *buf;
+       u_int32_t       *wcs;
+       va_list         ap_copy;
+       size_t          buf_size;
+       int             cnt;
+
+       buf     = stack_buf;
+       va_copy(ap_copy, ap);
+       cnt     = vsnprintf(buf, sizeof(stack_buf), fmt, ap_copy);
+       va_end(ap_copy);
+       if (cnt < 0) {
+               return cnt;
+       }
+       if ((size_t) cnt >= sizeof(stack_buf)) {
+               /* output was truncated: retry with a buffer that holds all
+                  cnt bytes plus the terminator */
+               buf_size        = (size_t) cnt + 1;
+               buf             = malloc(buf_size);
+               if (buf == NULL) {
+                       return -1;
+               }
+               va_copy(ap_copy, ap);
+               cnt     = vsnprintf(buf, buf_size, fmt, ap_copy);
+               va_end(ap_copy);
+               if ((cnt < 0) || ((size_t) cnt >= buf_size)) {
+                       free(buf);
+                       return (cnt < 0) ? cnt : -1;
+               }
+       }
+       /* one code point per byte is the worst case, plus the terminator */
+       wcs     = calloc((size_t) cnt + 1, sizeof(*wcs));
+       if (wcs == NULL) {
+               if (buf != stack_buf) {
+                       free(buf);
+               }
+               return -1;
+       }
+       cnt     = utf8_toucs(wcs, cnt + 1, buf, cnt);
+       printf("%ls", (wchar_t*) wcs);
+       free(wcs);
+       if (buf != stack_buf) {
+               free(buf);
+       }
+       return cnt;
+}
diff --git a/src/internal/utf8.h b/src/internal/utf8.h
new file mode 100644 (file)
index 0000000..6946ea9
--- /dev/null
@@ -0,0 +1,26 @@
+/**
+  * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
+  *
+  * This file and the library are covered by the LGPL version 3; see COPYING for details.
+  *
+  * History:
+  * - utf8.h Nov 25, 2009: small modifications of the original files to fit libcharencoding
+  * - Based on the basic UTF-8 manipulation routines
+  *   by Jeff Bezanson,
+  *   placed in the public domain, Fall 2005
+  *
+  * This code is designed to provide the utilities you need to manipulate
+  * UTF-8 as an internal string encoding. These functions do not perform the
+  * error checking normally needed when handling UTF-8 data, so if you happen
+  * to be from the Unicode Consortium you will want to flay me alive.
+  * I do this because error checking can be performed at the boundaries (I/O),
+  * with these routines reserved for higher performance on data known to be
+  * valid.
+  */
+
+#ifndef UTF8_H_
+#define UTF8_H_
+
+/*
+ * NOTE(review): va_list is used here although this header does not include
+ * <stdarg.h>; includers have to pull it in first.
+ */
+/**
+ * Formats fmt with the arguments in ap, decodes the UTF-8 result into wide
+ * characters and prints them to stdout. Returns the number of characters
+ * converted.
+ */
+int utf8_vprintf(char *fmt, va_list ap);
+
+#endif /* UTF8_H_ */
diff --git a/src_test/Makefile.am b/src_test/Makefile.am
new file mode 100644 (file)
index 0000000..87ed594
--- /dev/null
@@ -0,0 +1,6 @@
+# Interactive test program for libcharencoding.
+bin_PROGRAMS = libcharencoding_test
+
+libcharencoding_test_SOURCES = libcharencoding_test.c
+# Link through the libtool archive instead of a hard-coded .libs/*.a path,
+# so that the link also works when no static library is built.
+libcharencoding_test_LDADD = $(top_builddir)/src/libcharencoding.la -lreadline
+
+# $(top_srcdir) keeps the include path valid for out-of-tree builds.
+AM_CPPFLAGS = $(DEPS_CFLAGS) -I$(top_srcdir)/src
diff --git a/src_test/libcharencoding_test.c b/src_test/libcharencoding_test.c
new file mode 100644 (file)
index 0000000..3d6765c
--- /dev/null
@@ -0,0 +1,34 @@
+/**
+ * Copyright (c) 2009-2010 Mika Laitio <lamikr@pilppa.org>
+ *
+ * This library is covered by the LGPL version 3; see COPYING for details.
+ *
+ * History:
+ * - Created libcharencoding_test.c on: Nov 25, 2009
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <readline/readline.h>
+
+#include "charencoding.h"
+
+/**
+ * Reads one line from the terminal and returns it encoded as UTF-8.
+ *
+ * Returns a newly allocated string the caller must free(), or NULL when no
+ * input was available or the conversion failed.
+ */
+char *get_input_encoded_to_utf8()
+{
+       char    *text;
+       char    *ret_val;
+       int     err_flg;
+
+       // defined start value in case utf8_encode() does not write the flag
+       err_flg = 0;
+       printf("enter text: ");
+       text    = readline("");
+       ret_val = utf8_encode(text, &err_flg);
+       // readline() hands over a malloc'ed line that is not needed any more
+       free(text);
+       if (err_flg != 0) {
+               fprintf(stderr, "utf8_encode failed, error: %d\n", err_flg);
+               free(ret_val);
+               ret_val = NULL;
+       }
+       return ret_val;
+}
+
+/**
+ * Test driver: prints whether the locale is UTF-8, reads one line and
+ * prints it back through utf8_printf().
+ *
+ * Returns 0 on success and 1 when no text could be read or converted.
+ */
+int main(void)
+{
+       char    *utf8s;
+
+       printf("is_utf8_locale: %d\n", is_utf8_locale());
+       utf8s = get_input_encoded_to_utf8();
+       if (utf8s == NULL) {
+               // a NULL argument for "%s" must not reach the formatter
+               fprintf(stderr, "no text to print\n");
+               return 1;
+       }
+       utf8_printf("%s\n", utf8s);
+       free(utf8s);
+       return 0;
+}