/**
* Convert from Windows-1252 to UTF-8
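* Windows-1252 agrees with Latin-1 (ISO 8859-1) for every byte except
* 0x80-0x9f: Latin-1 reserves those for control codes, while Windows-1252
* assigns most of them to printable characters.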
*
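* Usage: 2utf8 [file ...]
* With no arguments, or for an argument of "-", standard input is read.
* The converted text is written to standard output.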
*
* BUG: There are a few chars undefined in Windows-1252
* which will map to a question mark.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Unicode code points for the Windows-1252 bytes 0x80-0x9f, indexed by
 * (byte - 0x80). The entries are assigned at the start of main().
 */
static int unicode[32];
/**
 * Read one byte from infp and return the Unicode code point it stands for.
 * This is the "win" half of the program's name.
 *
 * Bytes 0x80 through 0x9f are translated through the unicode[] table;
 * every other byte value is returned unchanged. When fgetc() returns EOF,
 * EOF is passed straight back to the caller.
 *
 * Open question from the original author: should a byte that Windows-1252
 * leaves undefined abort the run instead of being translated?
 * All code points for the characters in Windows-1252 are below 0x3000,
 * so 16 bits would be enough to hold the result.
 */
int getNext( FILE *infp ) {
    int byte = fgetc( infp );

    /* EOF is negative, so it fails the range test and is returned as is. */
    return ( byte >= 0x80 && byte <= 0x9f ) ? unicode[byte - 0x80] : byte;
}
/**
 * Write one Unicode code point to stdout, encoded as UTF-8.
 * This is the "utf" part of the name.
 *
 * Encoded length by range:
 *   0x0000   - 0x007f     one byte
 *   0x0080   - 0x07ff     two bytes
 *   0x0800   - 0xffff     three bytes
 *   0x10000  - 0x10ffff   four bytes
 *
 * Values that UTF-8 cannot represent (negative numbers, the surrogate
 * range 0xd800-0xdfff, anything above 0x10ffff) are written as '?',
 * the same substitute this program uses for undefined input bytes.
 */
void putBytes( int theChar ) {
    if ( theChar < 0 || 0x10ffff < theChar
         || ( 0xd800 <= theChar && theChar <= 0xdfff ) ) {
        putchar( '?' );
    }
    else if ( theChar <= 0x7f ) {
        putchar( theChar );
    }
    else if ( theChar <= 0x7ff ) {
        /* Two bytes carry 5 + 6 = 11 payload bits, hence the 0x7ff limit. */
        putchar( 0xc0 | ( theChar >> 6 ) );
        putchar( 0x80 | ( theChar & 0x3f ) );
    }
    else if ( theChar <= 0xffff ) {
        /* Three bytes carry 4 + 6 + 6 = 16 payload bits. */
        putchar( 0xe0 | ( theChar >> 12 ) );
        putchar( 0x80 | ( ( theChar >> 6 ) & 0x3f ) );
        putchar( 0x80 | ( theChar & 0x3f ) );
    }
    else {
        /* Four bytes carry 3 + 6 + 6 + 6 = 21 payload bits. */
        putchar( 0xf0 | ( theChar >> 18 ) );
        putchar( 0x80 | ( ( theChar >> 12 ) & 0x3f ) );
        putchar( 0x80 | ( ( theChar >> 6 ) & 0x3f ) );
        putchar( 0x80 | ( theChar & 0x3f ) );
    }
}
/**
 * Convert every input named in argv and write the result to stdout.
 *
 * argc and argv follow the main() convention: argv[0] is skipped and the
 * remaining argc - 1 entries are treated as input names. The name "-"
 * selects stdin. An input that cannot be opened is reported on stderr and
 * skipped; processing continues with the next name.
 */
void doFiles( int argc, char **argv ) {
    FILE *infp;
    char *infile;
    int c;

    while ( --argc > 0 ) {
        infile = *++argv;
        if ( 0 == strcmp( infile, "-" ) ) {
            infp = stdin;
        }
        /* Must open input in binary mode so we can catch chars with high bit set. */
        else if ( ( infp = fopen( infile, "rb" ) ) == NULL ) {
            fprintf( stderr, "Unable to open input file \"%s\".\n", infile );
            continue;
        }
        while ( ( c = getNext( infp ) ) != EOF ) {
            putBytes( c );
        }
        /*
         * Close what we opened so a long argument list does not run out of
         * streams. stdin is not ours to close.
         */
        if ( infp != stdin ) {
            fclose( infp );
        }
    }
}
/**
 * Program entry point.
 *
 * Fills the unicode[] table with the code points for bytes 0x80-0x9f and
 * then hands the argument list to doFiles(). When no input names are given,
 * a two-entry argument list naming "-" is passed instead.
 * Always returns 0.
 */
int main( int argc, char **argv ) {
    /*
     * Code points for bytes 0x80..0x9f in byte order, eight per row.
     * '?' is stored for the bytes Windows-1252 leaves undefined.
     */
    static const int highCodes[32] = {
        0x20ac, '?',    0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
        0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, '?',    0x017d, '?',
        '?',    0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
        0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, '?',    0x017e, 0x0178
    };
    char *stdinArgs[2];
    int slot;

    for ( slot = 0; slot < 32; slot++ ) {
        unicode[slot] = highCodes[slot];
    }

    if ( argc >= 2 ) {
        doFiles( argc, argv );
    }
    else {
        /* No names given: build an argument list that names "-" only. */
        stdinArgs[0] = argv[0];
        stdinArgs[1] = "-";
        doFiles( 2, stdinArgs );
    }
    return 0;
}