/**
 * Convert from Windows-1252 to UTF-8.
 * Note that Windows-1252 seems to be a superset of Latin-1 (ISO 8859-1).
 *
 * Usage: 2utf8 [file ...]
 *
 * Each named file is read in turn and its UTF-8 translation is written
 * to standard output.  The name "-" means standard input; with no
 * arguments at all, standard input is read.
 *
 * BUG: There are a few chars undefined in Windows-1252
 * which will map to a question mark.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/**
 * Unicode code points for the Windows-1252 bytes 0x80..0x9f, indexed by
 * (byte - 0x80).  Every other byte value maps to the code point with the
 * same number, so no table is needed for it.  The five bytes that
 * Windows-1252 leaves undefined are mapped to '?'.
 */
static const int unicode[32] = {
    0x20ac, /* 0x80 euro sign */
    '?',    /* 0x81 undefined */
    0x201a, /* 0x82 single low-9 quotation mark */
    0x0192, /* 0x83 latin small letter f with hook */
    0x201e, /* 0x84 double low-9 quotation mark */
    0x2026, /* 0x85 horizontal ellipsis */
    0x2020, /* 0x86 dagger */
    0x2021, /* 0x87 double dagger */
    0x02c6, /* 0x88 modifier letter circumflex accent */
    0x2030, /* 0x89 per mille sign */
    0x0160, /* 0x8a latin capital letter S with caron */
    0x2039, /* 0x8b single left-pointing angle quotation mark */
    0x0152, /* 0x8c latin capital ligature OE */
    '?',    /* 0x8d undefined */
    0x017d, /* 0x8e latin capital letter Z with caron */
    '?',    /* 0x8f undefined */
    '?',    /* 0x90 undefined */
    0x2018, /* 0x91 left single quotation mark */
    0x2019, /* 0x92 right single quotation mark */
    0x201c, /* 0x93 left double quotation mark */
    0x201d, /* 0x94 right double quotation mark */
    0x2022, /* 0x95 bullet */
    0x2013, /* 0x96 en dash */
    0x2014, /* 0x97 em dash */
    0x02dc, /* 0x98 small tilde */
    0x2122, /* 0x99 trade mark sign */
    0x0161, /* 0x9a latin small letter s with caron */
    0x203a, /* 0x9b single right-pointing angle quotation mark */
    0x0153, /* 0x9c latin small ligature oe */
    '?',    /* 0x9d undefined */
    0x017e, /* 0x9e latin small letter z with caron */
    0x0178  /* 0x9f latin capital letter Y with diaeresis */
};

/**
 * Get the next byte and convert it to unicode.
 * This is the "win" part of the name.
 *
 * Bytes 0x80..0x9f are looked up in the table above; every other byte is
 * returned unchanged.
 * BUG? Should this abort when it gets a byte not in Windows-1252?
 *
 * @param infp  stream to read one byte from
 * @return the unicode code point for that byte, or EOF when fgetc()
 *         reports EOF (end of input or read error)
 */
int getNext( FILE *infp )
{
    int c = fgetc( infp );

    if ( EOF == c ) {
        return EOF;
    }
    if ( 0x7f < c && c < 0xa0 ) {
        return unicode[c - 0x80];
    }
    return c;
}

/**
 * Put the unicode character out encoded in UTF-8.
 * This is the "utf" part of the name.
 *
 * The number of bytes written to standard output depends on the range:
 *   0x0000..0x007f     one byte
 *   0x0080..0x07ff     two bytes
 *   0x0800..0xffff     three bytes
 *   0x10000 and above  four bytes
 *
 * @param theChar  code point to encode; the caller is expected to pass a
 *                 value in 0..0x10ffff, no range check is made here
 */
void putBytes( int theChar )
{
    if ( 0xffff < theChar ) {
        /* four bytes: 3 + 6 + 6 + 6 payload bits */
        putchar( 0xf0 | ( theChar >> 18 ) );
        putchar( 0x80 | ( ( theChar >> 12 ) & 0x3f ) );
        putchar( 0x80 | ( ( theChar >> 6 ) & 0x3f ) );
        putchar( 0x80 | ( theChar & 0x3f ) );
    }
    else if ( 0x7ff < theChar ) {
        /*
         * three bytes: 4 + 6 + 6 payload bits.  The boundary is 0x7ff
         * because the two-byte form only has 5 + 6 = 11 payload bits.
         */
        putchar( 0xe0 | ( theChar >> 12 ) );
        putchar( 0x80 | ( ( theChar >> 6 ) & 0x3f ) );
        putchar( 0x80 | ( theChar & 0x3f ) );
    }
    else if ( 0x7f < theChar ) {
        /* two bytes: 5 + 6 payload bits */
        putchar( 0xc0 | ( theChar >> 6 ) );
        putchar( 0x80 | ( theChar & 0x3f ) );
    }
    else {
        putchar( theChar );
    }
}

/**
 * Convert every named input to UTF-8 on standard output.
 *
 * argv[0] is skipped, so the arguments have the same layout as those of
 * main().  A file that cannot be opened is reported on stderr and
 * skipped; processing continues with the next name.
 *
 * @param argc  number of entries in argv, including argv[0]
 * @param argv  argv[1]..argv[argc-1] are file names, "-" meaning stdin
 */
void doFiles( int argc, char **argv )
{
    FILE *infp;
    char *infile;
    int c;

    while ( --argc > 0 ) {
        infile = *++argv;
        if ( 0 == strcmp( infile, "-" ) ) {
            infp = stdin;
        }
        /*
         * Must open input in binary mode so we can catch chars with
         * high bit set.
         */
        else if ( ( infp = fopen( infile, "rb" ) ) == NULL ) {
            fprintf( stderr, "Unable to open input file \"%s\".\n", infile );
            continue;
        }

        while ( ( c = getNext( infp ) ) != EOF ) {
            putBytes( c );
        }

        /* getNext() returns EOF for read errors too; tell them apart. */
        if ( ferror( infp ) ) {
            fprintf( stderr, "Error reading input file \"%s\".\n", infile );
        }

        /* Release the handle we opened; stdin is not ours to close. */
        if ( infp != stdin ) {
            fclose( infp );
        }
    }
}

/**
 * Program entry point.  With no file arguments, behaves as if the single
 * argument "-" (standard input) had been given.
 */
int main( int argc, char **argv )
{
    char *oneName[2];

    if ( argc < 2 ) {
        /* Build a substitute argument vector: program name, then "-". */
        oneName[0] = argv[0];
        oneName[1] = "-";
        doFiles( 2, oneName );
    }
    else {
        doFiles( argc, argv );
    }
    return EXIT_SUCCESS;
}