Detecting "letter-like" UTF-8 code points

**Sir Galahad** · 03-09-2021

So I'm currently working on this project which I've already added some very rudimentary UTF-8 support. What I'd really like to do though is be able to detect code points which represent conceptual "letter" types. So far I've only seen vague references to doing this sort of thing in the specs (https://www.unicode.org/versions/Uni...04.pdf#G134153).

Here's a toy program to show basically where I'm at on this:

Code:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

typedef unsigned long utf8_rune;

void output(utf8_rune glyph)
{
 bool is_letter_like = false;
/*
 ...somehow detect valid letter types here...
*/
 printf("0x%zu: %s\n", glyph, is_letter_like ? "true" : "false");
}

bool output_utf8_glyphs(void* utf8, size_t length)
{
 typedef unsigned char byte;
 byte* next = utf8;
 byte* end = next + length; 
 for(;;)
 {
  if(next == end)
   break;
  byte octet = *next++;
  utf8_rune glyph = octet;  
  if(octet > 0x7f)
  {
   if(octet < 0xe0)
   {
    if((next + 1) > end)
     return false;
    glyph |= ((*next++) << 8); 
   }
   else if(octet < 0xf0)
   {
    if((next + 2) > end)
     return false;
    glyph |= ((*next++) << 8); 
    glyph |= ((*next++) << 16); 
   }
   else if(octet < 0xf8)
   {
    if((next + 3) > end)
     return false;
    glyph |= ((*next++) << 8); 
    glyph |= ((*next++) << 16);
    glyph |= ((*next++) << 24);     
   }
  } 
  output(glyph);   
 }
 return true;
}

int main(int argc, char** argv)
{
 if(argc == 1)
 {
  char utf8[] = 
"  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
"  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς\n"
"  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ\n"
"  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿\n" 
"  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ\n"
"  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν\n"
"  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,\n"
"  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν\n"
"  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον\n"
"  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι\n"
"  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν\n"
"  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους\n"
"  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ\n"
"  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ\n"
"  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς\n"
"  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.\n";
  if(!output_utf8_glyphs(utf8, sizeof(utf8)))
   puts("Invalid unicode!"); 
 }
 else for(;;)
 {
  char* path = *(++argv);
  if(!path)
   break;  
  FILE* stream = fopen(path, "rb");
  if(!stream)
  {
   fprintf(stderr, "Error: cannot process file '%s'\n", path);
   continue;
  }
  fseek(stream, 0, SEEK_END);
  size_t size = ftell(stream);
  rewind(stream);
  char* utf8 = malloc(size + 1);
  fread(utf8, 1, size, stream);
  utf8[size] = 0;
  fclose(stream); 
  if(!output_utf8_glyphs(utf8, size))
   puts("Invalid unicode!");
  free(utf8);
 }
 return 0;
}

**CoiledAlizarine** · 03-09-2021

You can change to the UTF-8 codepage using setlocale() and use iswalpha()

Code:

#include <stdio.h>

#include <wchar.h>
#include <wctype.h>
#include <locale.h>

int main() {
    setlocale(LC_ALL, ".utf8");
    printf("%d\n", iswalpha(L'\x0156') > 0);
}

\x0156 is the UTF-8 character Ŗ

**Sir Galahad** · 03-09-2021

Originally Posted by CoiledAlizarine

You can change to the UTF-8 codepage using setlocale() and use iswalpha()

Code:

#include <stdio.h>

#include <wchar.h>
#include <wctype.h>
#include <locale.h>

int main() {
    setlocale(LC_ALL, ".utf8");
    printf("%d\n", iswalpha(L'\x0156') > 0);
}

\x0156 is the UTF-8 character Ŗ

Interesting idea! Unfortunately, it doesn't work very reliably. Windows handled it correctly, but Linux just seems confused. I was eventually able to get it to recognize them...after setting the locale to "en_GB.UTF-8"!

Code:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
#include <locale.h>

typedef unsigned long utf8_rune;

void output(utf8_rune glyph)
{
 bool is_letter_like = iswalpha(glyph);
 printf("0x%x: %s\n", (unsigned)glyph, is_letter_like ? "true" : "false");
}

bool output_utf8_glyphs(void* utf8, size_t length)
{
 typedef unsigned char byte;
 byte* next = utf8;
 byte* end = next + length;
 for(;;)
 {
  if(next == end)
   break;
  byte octet = *next++;
  utf8_rune glyph = octet;
  if(octet > 0x7f)
  {
   if(octet < 0xe0)
   {
    if((next + 1) > end)
     return false;
    glyph <<= 8;
    glyph |= *next++;
   }
   else if(octet < 0xf0)
   {
    if((next + 2) > end)
     return false;
    glyph <<= 8;
    glyph |= *next++;
    glyph <<= 8;
    glyph |= *next++;
   }
   else if(octet < 0xf8)
   {
    if((next + 3) > end)
     return false;
    glyph <<= 8;
    glyph |= *next++;
    glyph <<= 8;
    glyph |= *next++;
    glyph <<= 8;
    glyph |= *next++;
   }
  }
  output(glyph);
 }
 return true;
}

int main(int argc, char** argv)
{
 setlocale(LC_ALL, "en_GB.UTF-8");
 if(argc == 1)
 {
  char utf8[] =
"  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
"  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς\n"
"  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ\n"
"  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿\n"
"  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ\n"
"  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν\n"
"  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,\n"
"  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν\n"
"  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον\n"
"  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι\n"
"  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν\n"
"  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους\n"
"  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ\n"
"  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ\n"
"  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς\n"
"  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.\n"
"  ŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖ\n";
  if(!output_utf8_glyphs(utf8, sizeof(utf8)))
   puts("Invalid unicode!");
 }
 else for(;;)
 {
  char* path = *(++argv);
  if(!path)
   break;
  FILE* stream = fopen(path, "rb");
  if(!stream)
  {
   fprintf(stderr, "Error: cannot process file '%s'\n", path);
   continue;
  }
  fseek(stream, 0, SEEK_END);
  size_t size = ftell(stream);
  rewind(stream);
  char* utf8 = malloc(size + 1);
  fread(utf8, 1, size, stream);
  utf8[size] = 0;
  fclose(stream);
  if(!output_utf8_glyphs(utf8, size))
   puts("Invalid unicode!");
  free(utf8);
 }
 return 0;
}

Maybe a more direct approach is needed here?

**john.c** · 03-09-2021

Your conversion of UTF-8 to a unicode code point is wrong.
For example, it converts ό (greek small letter omicron with oxia) to 0x12172769 instead of the correct 0x1F79.
It should be something like (can probably be made shorter) :

Code:

bool output_utf8(const byte* utf8, size_t length)
{
    const byte* end = utf8 + length;
    while (utf8 < end)
    {
        // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
        // U+0000  to U+007F     0xxxxxxx
        // U+0080  to U+07FF     110xxxxx  10xxxxxx
        // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
        // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
        unsigned code = 0;
        if ((*utf8 & 0x80) == 0)
            code = *utf8++;
        else if ((*utf8 & 0xE0) == 0xC0)
        {
            code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
            utf8 += 2;
        }
        else if ((*utf8 & 0xF0) == 0xE0)
        {
            code = ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
            utf8 += 3;
        }
        else if ((*utf8 & 0xF8) == 0xF0)
        {
            code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) | ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
            utf8 += 4;
        }
        else
            return false;
        output(code);
    }
    return true;
}

**Sir Galahad** · 03-09-2021

Originally Posted by john.c

Your conversion of UTF-8 to a unicode code point is wrong.
For example, it converts ό (greek small letter omicron with oxia) to 0x12172769 instead of the correct 0x1F79.
It should be something like (can probably be made shorter) :

Code:

bool output_utf8(const byte* utf8, size_t length)
{
    const byte* end = utf8 + length;
    while (utf8 < end)
    {
        // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
        // U+0000  to U+007F     0xxxxxxx
        // U+0080  to U+07FF     110xxxxx  10xxxxxx
        // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
        // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
        unsigned code = 0;
        if ((*utf8 & 0x80) == 0)
            code = *utf8++;
        else if ((*utf8 & 0xE0) == 0xC0)
        {
            code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
            utf8 += 2;
        }
        else if ((*utf8 & 0xF0) == 0xE0)
        {
            code = ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
            utf8 += 3;
        }
        else if ((*utf8 & 0xF8) == 0xF0)
        {
            code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) | ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
            utf8 += 4;
        }
        else
            return false;
        output(code);
    }
    return true;
}

Thank John, it works perfectly! Optimizations can be done later anyway.

Code:


#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>

typedef unsigned utf8_rune;
typedef unsigned char byte;

void output(utf8_rune glyph) {
  bool is_letter_like = iswalpha(glyph);
  printf("0x%x: %s\n", (unsigned)glyph, is_letter_like ? "true" : "false");
}

bool output_utf8(const byte *utf8, size_t length) {
  const byte *end = utf8 + length;
  while (utf8 < end) {
    // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
    // U+0000  to U+007F     0xxxxxxx
    // U+0080  to U+07FF     110xxxxx  10xxxxxx
    // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
    // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
    unsigned code = 0;
    if ((*utf8 & 0x80) == 0)
      code = *utf8++;
    else if ((*utf8 & 0xE0) == 0xC0) {
      code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
      utf8 += 2;
    } else if ((*utf8 & 0xF0) == 0xE0) {
      code =
          ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
      utf8 += 3;
    } else if ((*utf8 & 0xF8) == 0xF0) {
      code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) |
             ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
      utf8 += 4;
    } else
      return false;
    output(code);
  }
  return true;
}

int main(int argc, char **argv) {
  setlocale(LC_ALL, "en_GB.UTF-8");
  if (argc == 1) {
    char utf8[] =
        "  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
        "  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς\n"
        "  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ\n"
        "  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿\n"
        "  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ\n"
        "  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν\n"
        "  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,\n"
        "  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν\n"
        "  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον\n"
        "  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι\n"
        "  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν\n"
        "  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους\n"
        "  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ\n"
        "  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ\n"
        "  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς\n"
        "  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.\n"
        "  ŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖ\n";
    if (!output_utf8(utf8, sizeof(utf8)))
      puts("Invalid unicode!");
  } else
    for (;;) {
      char *path = *(++argv);
      if (!path)
        break;
      FILE *stream = fopen(path, "rb");
      if (!stream) {
        fprintf(stderr, "Error: cannot process file '%s'\n", path);
        continue;
      }
      fseek(stream, 0, SEEK_END);
      size_t size = ftell(stream);
      rewind(stream);
      char *utf8 = malloc(size + 1);
      fread(utf8, 1, size, stream);
      utf8[size] = 0;
      fclose(stream);
      if (!output_utf8(utf8, size))
        puts("Invalid unicode!");
      free(utf8);
    }
  return 0;
}

There's still the issue of setlocale not always working with default settings, but that's likely just one of those quirky Linux issues. I dunno. I guess I could inspect the "LANG" environment variable as a workaround.

**Sir Galahad** · 03-09-2021

For what it's worth, I did find this https://www.unicode.org/Public/5.0.0...Properties.txt

# ================================================

# Derived Property: Alphabetic
# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic

0041..005A ; Alphabetic # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
0061..007A ; Alphabetic # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
00AA ; Alphabetic # L& FEMININE ORDINAL INDICATOR
00B5 ; Alphabetic # L& MICRO SIGN
00BA ; Alphabetic # L& MASCULINE ORDINAL INDICATOR
00C0..00D6 ; Alphabetic # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS
00D8..00F6 ; Alphabetic # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS
00F8..01BA ; Alphabetic # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL
01BB ; Alphabetic # Lo LATIN LETTER TWO WITH STROKE

(...)

1D716..1D734 ; Alphabetic # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA
1D736..1D74E ; Alphabetic # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA
1D750..1D76E ; Alphabetic # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA
1D770..1D788 ; Alphabetic # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA
1D78A..1D7A8 ; Alphabetic # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA
1D7AA..1D7C2 ; Alphabetic # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA
1D7C4..1D7CB ; Alphabetic # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA
20000..2A6D6 ; Alphabetic # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
2F800..2FA1D ; Alphabetic # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D

# Total code points: 93217

Not the most tasty prospect, but if that is indeed a complete listing then I suppose I could try to parse it directly somehow. Kind of daunting. But then again, what do you expect? Every language has very specific ranges that fall into that particular category. It's not some simple matter of a few well-initialized lookup tables, unfortunately.

**Sir Galahad** · 03-09-2021

This seems to work across the board so far. Just setting the LC_ALL locale parameter to an empty string!

Code:

#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>

typedef unsigned utf8_rune;
typedef unsigned char byte;

void output(utf8_rune glyph) {
  bool is_letter_like = iswalpha(glyph);
  printf("0x%x: %s\n", (unsigned)glyph, is_letter_like ? "true" : "false");
}

bool output_utf8(const byte *utf8, size_t length) {
  const byte *end = utf8 + length;
  while (utf8 < end) {
    // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
    // U+0000  to U+007F     0xxxxxxx
    // U+0080  to U+07FF     110xxxxx  10xxxxxx
    // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
    // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
    unsigned code = 0;
    if ((*utf8 & 0x80) == 0)
      code = *utf8++;
    else if ((*utf8 & 0xE0) == 0xC0) {
      code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
      utf8 += 2;
    } else if ((*utf8 & 0xF0) == 0xE0) {
      code =
          ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
      utf8 += 3;
    } else if ((*utf8 & 0xF8) == 0xF0) {
      code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) |
             ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
      utf8 += 4;
    } else
      return false;
    output(code);
  }
  return true;
}

int main(int argc, char **argv) {
  setlocale(LC_ALL, "");
  if (argc == 1) {
    char utf8[] =
        "  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
        "  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς\n"
        "  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ\n"
        "  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿\n"
        "  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ\n"
        "  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν\n"
        "  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,\n"
        "  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν\n"
        "  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον\n"
        "  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι\n"
        "  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν\n"
        "  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους\n"
        "  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ\n"
        "  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ\n"
        "  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς\n"
        "  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.\n"
        "  ŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖ\n";
    if (!output_utf8(utf8, sizeof(utf8)))
      puts("Invalid unicode!");
  } else
    for (;;) {
      char *path = *(++argv);
      if (!path)
        break;
      FILE *stream = fopen(path, "rb");
      if (!stream) {
        fprintf(stderr, "Error: cannot process file '%s'\n", path);
        continue;
      }
      fseek(stream, 0, SEEK_END);
      size_t size = ftell(stream);
      rewind(stream);
      char *utf8 = malloc(size + 1);
      fread(utf8, 1, size, stream);
      utf8[size] = 0;
      fclose(stream);
      if (!output_utf8(utf8, size))
        puts("Invalid unicode!");
      free(utf8);
    }
  return 0;
}

**john.c** · 03-10-2021

Originally Posted by Sir Galahad

This seems to work across the board so far. Just setting the LC_ALL locale parameter to an empty string!

Excellent! Apparently "" means the "user-preferred locale".
You could try printing the return value of setlocale(LC_ALL, "") (as %s).
If you're in Britain it's probably going to say en_GB.UTF-8 anyway.

Note that in the output_utf8 call, sizeof(utf8) should perhaps be one less to leave out the '\0' (although it's probably okay to process it).

One small simplification of output_utf8 is to just use < for the if's:

Code:

        if (*utf8 < 0x80)
        else if (*utf8 < 0xE0)
        else if (*utf8 < 0xF0)
        else if (*utf8 < 0xF8)
        else

**Sir Galahad** · 03-10-2021

Originally Posted by john.c

Apparently "" means the "user-preferred locale".
You could try printing the return value of setlocale(LC_ALL, "") (as %s).
If you're in Britain it's probably going to say en_GB.UTF-8 anyway.

Well I'd be happy if it just worked reliably across platforms.

Originally Posted by john.c

Note that in the output_utf8 call, sizeof(utf8) should perhaps be one less to leave out the '\0' (although it's probably okay to process it).

UTF-8 does allow arbitrary null bytes, so that won't be a problem.

Originally Posted by john.c

One small simplification of output_utf8 is to just use < for the if's:

Code:

        if (*utf8 < 0x80)
        else if (*utf8 < 0xE0)
        else if (*utf8 < 0xF0)
        else if (*utf8 < 0xF8)
        else

Even better. I ran some timing tests on it as well. No bottlenecks and plenty efficient.

Cheers!

**Salem** · 03-10-2021

Are you in complete control of your UTF-8 data?

There are malicious code sequences that can upset the apple cart.
utf8 decoder vulnerability at DuckDuckGo

**Sir Galahad** · 03-10-2021

Originally Posted by Salem

Are you in complete control of your UTF-8 data?

There are malicious code sequences that can upset the apple cart.
utf8 decoder vulnerability at DuckDuckGo

Right. Everybody wants more flexibility, better multilingual support. But UTF-8 just isn't a very good solution. (In fact, it's 💩!) ASCII's not much better, but at least it's straightforward to process.

I guess I just need to weigh the risks. The context I'll be using it is fairly passive, but it does have to potential to pass some UTF-8 input through to display functions and such. I'm not sure what sort of risk that poses though.

Anyway, good to know. Thanks!

**Malcolm McLean** · 03-10-2021

Originally Posted by Sir Galahad

Right. Everybody wants more flexibility, better multilingual support. But UTF-8 just isn't a very good solution. (In fact, it's 💩!) ASCII's not much better, but at least it's straightforward to process.

I guess I just need to weigh the risks. The context I'll be using it is fairly passive, but it does have to potential to pass some UTF-8 input through to display functions and such. I'm not sure what sort of risk that poses though.

Anyway, good to know. Thanks!

UTF-8 is backwards compatible with ascii, which is a huge advantage. UTF-16 has the disadvantage that it can't represent some code points, whilst UTF-32 is rather extravagant if most of the text is English. Another issue with multi-byte encoding is that no-one has settled on an endiannness standard for files.

So you'll find that UTF-8 is used a lot. Most functions treat strings as atomic, so the string encoding doesn't really matter. Where you do need to access individual characters, UTF-8 is a bit more awkward to use than other encodings. That's typically just before converting a character to a glyph, or for passing to or from an operating system function, or for simple text formatting.

**john.c** · 03-10-2021

An addition to the 'if/else if' chain to handle another possible error:

Code:

        if (*utf8 < 0x80)
        else if (*utf8 < 0xC0)
            return false; // out-of-place continuation byte: 10xxxxxx
        else if (*utf8 < 0xE0)
        else if (*utf8 < 0xF0)
        else if (*utf8 < 0xF8)
        else
            return false;

**Sir Galahad** · 03-10-2021

Originally Posted by john.c

An addition to the 'if/else if' chain to handle another possible error:

Code:

        if (*utf8 < 0x80)
        else if (*utf8 < 0xC0)
            return false; // out-of-place continuation byte: 10xxxxxx
        else if (*utf8 < 0xE0)
        else if (*utf8 < 0xF0)
        else if (*utf8 < 0xF8)
        else
            return false;

That helps! Here's the current version then:

Code:


#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>

typedef unsigned utf8_rune;
typedef unsigned char byte;

void output(utf8_rune glyph) {
  bool is_letter_like = iswalpha(glyph);
  printf("0x%x: %s\n", (unsigned)glyph, is_letter_like ? "true" : "false");
}

bool output_utf8(const byte* utf8, size_t length) {
  const byte* end = utf8 + length;
  while (utf8 < end) {
    // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
    // U+0000  to U+007F     0xxxxxxx
    // U+0080  to U+07FF     110xxxxx  10xxxxxx
    // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
    // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
    unsigned code = 0;
    if (*utf8 < 0x80)
      code = *utf8++;
    else if (*utf8 < 0xC0)
      return false;  // out-of-place continuation byte: 10xxxxxx
    else if (*utf8 < 0xE0) {
      code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
      utf8 += 2;
    } else if (*utf8 < 0xF0) {
      code =
          ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
      utf8 += 3;
    } else if (*utf8 < 0xF8) {
      code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) |
             ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
      utf8 += 4;
    } else
      return false;
    output(code);
  }
  return true;
}

int main(int argc, char** argv) {
  setlocale(LC_ALL, "");
  if (argc == 1) {
    char utf8[] =
        "  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
        "  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς\n"
        "  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ\n"
        "  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿\n"
        "  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ\n"
        "  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν\n"
        "  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,\n"
        "  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν\n"
        "  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον\n"
        "  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι\n"
        "  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν\n"
        "  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους\n"
        "  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ\n"
        "  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ\n"
        "  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς\n"
        "  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.\n"
        "  ŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖ\n";
    if (!output_utf8(utf8, sizeof(utf8)))
      puts("Invalid unicode!");
  } else
    for (;;) {
      char* path = *(++argv);
      if (!path)
        break;
      FILE* stream = fopen(path, "rb");
      if (!stream) {
        fprintf(stderr, "Error: cannot process file '%s'\n", path);
        continue;
      }
      fseek(stream, 0, SEEK_END);
      size_t size = ftell(stream);
      rewind(stream);
      char* utf8 = malloc(size + 1);
      fread(utf8, 1, size, stream);
      utf8[size] = 0;
      fclose(stream);
      if (!output_utf8(utf8, size))
        puts("Invalid unicode!");
      free(utf8);
    }
  return 0;
}

Originally Posted by Malcolm McLean

UTF-8 is backwards compatible with ascii, which is a huge advantage. UTF-16 has the disadvantage that it can't represent some code points, whilst UTF-32 is rather extravagant if most of the text is English. Another issue with multi-byte encoding is that no-one has settled on an endiannness standard for files.

So you'll find that UTF-8 is used a lot. Most functions treat strings as atomic, so the string encoding doesn't really matter. Where you do need to access individual characters, UTF-8 is a bit more awkward to use than other encodings. That's typically just before converting a character to a glyph, or for passing to or from an operating system function, or for simple text formatting.

It is an improvement, compared to previous standards. Maybe they just moved too quickly to standardize it before they had ironed out all of the issues? Well we're stuck with it for now anyway. No use complaining too much about it I guess.

**Sir Galahad** · 03-10-2021

Whoops, forgot the bounds checking step!

Code:

bool output_utf8(const byte* utf8, size_t length) {
  const byte* end = utf8 + length;
  while (utf8 < end) {
    // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
    // U+0000  to U+007F     0xxxxxxx
    // U+0080  to U+07FF     110xxxxx  10xxxxxx
    // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
    // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
    unsigned code = 0;
    if (*utf8 < 0x80)
      code = *utf8++;
    else if (*utf8 < 0xC0)
      return false;  // out-of-place continuation byte: 10xxxxxx
    else if (*utf8 < 0xE0) {
      if ((utf8 + 2) > end)
        return false;
      code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
      utf8 += 2;
    } else if (*utf8 < 0xF0) {
      if ((utf8 + 3) > end)
        return false;
      code =
          ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
      utf8 += 3;
    } else if (*utf8 < 0xF8) {
      if ((utf8 + 4) > end)
        return false;
      code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) |
             ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
      utf8 += 4;
    } else
      return false;
    output(code);
  }
  return true;
}

Thread: Detecting "letter-like" UTF-8 code points

Thread Tools

Search Thread

Display

Detecting "letter-like" UTF-8 code points

Similar Threads

Detecting "deadlock" - Interesting question real-time C programming

How can i make a "letter could not be found" break?

Count the number of letter "a" on a sentence

"itoa"-"_itoa" , "inp"-"_inp", Why some functions have "

"CWnd"-"HWnd","CBitmap"-"HBitmap"...., What is mean by "

Tags for this Thread