Thread: Detecting "letter-like" UTF-8 code points

  1. #1
    Registered User Sir Galahad's Avatar
    Join Date
    Nov 2016
    Location
    The Round Table
    Posts
    277

    Detecting "letter-like" UTF-8 code points

    So I'm currently working on this project which I've already added some very rudimentary UTF-8 support. What I'd really like to do though is be able to detect code points which represent conceptual "letter" types. So far I've only seen vague references to doing this sort of thing in the specs (https://www.unicode.org/versions/Uni...04.pdf#G134153).

    Here's a toy program to show basically where I'm at on this:

    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>
    
    typedef unsigned long utf8_rune;
    
    void output(utf8_rune glyph)
    {
     bool is_letter_like = false;
    /*
     ...somehow detect valid letter types here...
    */
     printf("0x%zu: %s\n", glyph, is_letter_like ? "true" : "false");
    }
    
    bool output_utf8_glyphs(void* utf8, size_t length)
    {
     typedef unsigned char byte;
     byte* next = utf8;
     byte* end = next + length; 
     for(;;)
     {
      if(next == end)
       break;
      byte octet = *next++;
      utf8_rune glyph = octet;  
      if(octet > 0x7f)
      {
       if(octet < 0xe0)
       {
        if((next + 1) > end)
         return false;
        glyph |= ((*next++) << 8); 
       }
       else if(octet < 0xf0)
       {
        if((next + 2) > end)
         return false;
        glyph |= ((*next++) << 8); 
        glyph |= ((*next++) << 16); 
       }
       else if(octet < 0xf8)
       {
        if((next + 3) > end)
         return false;
        glyph |= ((*next++) << 8); 
        glyph |= ((*next++) << 16);
        glyph |= ((*next++) << 24);     
       }
      } 
      output(glyph);   
     }
     return true;
    }
    
    int main(int argc, char** argv)
    {
     if(argc == 1)
     {
      char utf8[] = 
    "  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
    "  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς\n"
    "  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ\n"
    "  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿\n" 
    "  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ\n"
    "  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν\n"
    "  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,\n"
    "  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν\n"
    "  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον\n"
    "  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι\n"
    "  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν\n"
    "  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους\n"
    "  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ\n"
    "  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ\n"
    "  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς\n"
    "  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.\n";
      if(!output_utf8_glyphs(utf8, sizeof(utf8)))
       puts("Invalid unicode!"); 
     }
     else for(;;)
     {
      char* path = *(++argv);
      if(!path)
       break;  
      FILE* stream = fopen(path, "rb");
      if(!stream)
      {
       fprintf(stderr, "Error: cannot process file '%s'\n", path);
       continue;
      }
      fseek(stream, 0, SEEK_END);
      size_t size = ftell(stream);
      rewind(stream);
      char* utf8 = malloc(size + 1);
      fread(utf8, 1, size, stream);
      utf8[size] = 0;
      fclose(stream); 
      if(!output_utf8_glyphs(utf8, size))
       puts("Invalid unicode!");
      free(utf8);
     }
     return 0;
    }

  2. #2
    Registered User
    Join Date
    Feb 2021
    Posts
    6
    You can change to the UTF-8 codepage using setlocale() and use iswalpha()

    Code:
    #include <stdio.h>
    
    #include <wchar.h>
    #include <wctype.h>
    #include <locale.h>
    
    int main() {
        setlocale(LC_ALL, ".utf8");
        printf("%d\n", iswalpha(L'\x0156') > 0);
    }
    \x0156 is the UTF-8 character Ŗ

  3. #3
    Registered User Sir Galahad's Avatar
    Join Date
    Nov 2016
    Location
    The Round Table
    Posts
    277
    Quote Originally Posted by CoiledAlizarine View Post
    You can change to the UTF-8 codepage using setlocale() and use iswalpha()

    Code:
    #include <stdio.h>
    
    #include <wchar.h>
    #include <wctype.h>
    #include <locale.h>
    
    int main() {
        setlocale(LC_ALL, ".utf8");
        printf("%d\n", iswalpha(L'\x0156') > 0);
    }
    \x0156 is the UTF-8 character Ŗ
    Interesting idea! Unfortunately, it doesn't work very reliably. Windows handled it correctly, but Linux just seems confused. I was eventually able to get it to recognize them...after setting the locale to "en_GB.UTF-8"!

    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>
    #include <wchar.h>
    #include <wctype.h>
    #include <locale.h>
    
    typedef unsigned long utf8_rune;
    
    void output(utf8_rune glyph)
    {
     bool is_letter_like = iswalpha(glyph);
     printf("0x%x: %s\n", (unsigned)glyph, is_letter_like ? "true" : "false");
    }
    
    bool output_utf8_glyphs(void* utf8, size_t length)
    {
     typedef unsigned char byte;
     byte* next = utf8;
     byte* end = next + length;
     for(;;)
     {
      if(next == end)
       break;
      byte octet = *next++;
      utf8_rune glyph = octet;
      if(octet > 0x7f)
      {
       if(octet < 0xe0)
       {
        if((next + 1) > end)
         return false;
        glyph <<= 8;
        glyph |= *next++;
       }
       else if(octet < 0xf0)
       {
        if((next + 2) > end)
         return false;
        glyph <<= 8;
        glyph |= *next++;
        glyph <<= 8;
        glyph |= *next++;
       }
       else if(octet < 0xf8)
       {
        if((next + 3) > end)
         return false;
        glyph <<= 8;
        glyph |= *next++;
        glyph <<= 8;
        glyph |= *next++;
        glyph <<= 8;
        glyph |= *next++;
       }
      }
      output(glyph);
     }
     return true;
    }
    
    int main(int argc, char** argv)
    {
     setlocale(LC_ALL, "en_GB.UTF-8");
     if(argc == 1)
     {
      char utf8[] =
    "  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
    "  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς\n"
    "  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ\n"
    "  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿\n"
    "  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ\n"
    "  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν\n"
    "  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,\n"
    "  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν\n"
    "  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον\n"
    "  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι\n"
    "  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν\n"
    "  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους\n"
    "  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ\n"
    "  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ\n"
    "  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς\n"
    "  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.\n"
    "  ŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖ\n";
      if(!output_utf8_glyphs(utf8, sizeof(utf8)))
       puts("Invalid unicode!");
     }
     else for(;;)
     {
      char* path = *(++argv);
      if(!path)
       break;
      FILE* stream = fopen(path, "rb");
      if(!stream)
      {
       fprintf(stderr, "Error: cannot process file '%s'\n", path);
       continue;
      }
      fseek(stream, 0, SEEK_END);
      size_t size = ftell(stream);
      rewind(stream);
      char* utf8 = malloc(size + 1);
      fread(utf8, 1, size, stream);
      utf8[size] = 0;
      fclose(stream);
      if(!output_utf8_glyphs(utf8, size))
       puts("Invalid unicode!");
      free(utf8);
     }
     return 0;
    }
    Maybe a more direct approach is needed here?

  4. #4
    Registered User
    Join Date
    Dec 2017
    Posts
    1,633
    Your conversion of UTF-8 to a unicode code point is wrong.
    For example, it converts ό (greek small letter omicron with oxia) to 0x12172769 instead of the correct 0x1F79.
    It should be something like (can probably be made shorter) :
    Code:
    bool output_utf8(const byte* utf8, size_t length)
    {
        const byte* end = utf8 + length;
        while (utf8 < end)
        {
            // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
            // U+0000  to U+007F     0xxxxxxx
            // U+0080  to U+07FF     110xxxxx  10xxxxxx
            // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
            // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
            unsigned code = 0;
            if ((*utf8 & 0x80) == 0)
                code = *utf8++;
            else if ((*utf8 & 0xE0) == 0xC0)
            {
                code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
                utf8 += 2;
            }
            else if ((*utf8 & 0xF0) == 0xE0)
            {
                code = ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
                utf8 += 3;
            }
            else if ((*utf8 & 0xF8) == 0xF0)
            {
                code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) | ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
                utf8 += 4;
            }
            else
                return false;
            output(code);
        }
        return true;
    }
    A little inaccuracy saves tons of explanation. - H.H. Munro

  5. #5
    Registered User Sir Galahad's Avatar
    Join Date
    Nov 2016
    Location
    The Round Table
    Posts
    277
    Quote Originally Posted by john.c View Post
    Your conversion of UTF-8 to a unicode code point is wrong.
    For example, it converts ό (greek small letter omicron with oxia) to 0x12172769 instead of the correct 0x1F79.
    It should be something like (can probably be made shorter) :
    Code:
    bool output_utf8(const byte* utf8, size_t length)
    {
        const byte* end = utf8 + length;
        while (utf8 < end)
        {
            // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
            // U+0000  to U+007F     0xxxxxxx
            // U+0080  to U+07FF     110xxxxx  10xxxxxx
            // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
            // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
            unsigned code = 0;
            if ((*utf8 & 0x80) == 0)
                code = *utf8++;
            else if ((*utf8 & 0xE0) == 0xC0)
            {
                code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
                utf8 += 2;
            }
            else if ((*utf8 & 0xF0) == 0xE0)
            {
                code = ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
                utf8 += 3;
            }
            else if ((*utf8 & 0xF8) == 0xF0)
            {
                code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) | ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
                utf8 += 4;
            }
            else
                return false;
            output(code);
        }
        return true;
    }
    Thank John, it works perfectly! Optimizations can be done later anyway.

    Code:
    
    #include <locale.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <wchar.h>
    #include <wctype.h>
    
    typedef unsigned utf8_rune;
    typedef unsigned char byte;
    
    void output(utf8_rune glyph) {
      bool is_letter_like = iswalpha(glyph);
      printf("0x%x: %s\n", (unsigned)glyph, is_letter_like ? "true" : "false");
    }
    
    bool output_utf8(const byte *utf8, size_t length) {
      const byte *end = utf8 + length;
      while (utf8 < end) {
        // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
        // U+0000  to U+007F     0xxxxxxx
        // U+0080  to U+07FF     110xxxxx  10xxxxxx
        // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
        // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
        unsigned code = 0;
        if ((*utf8 & 0x80) == 0)
          code = *utf8++;
        else if ((*utf8 & 0xE0) == 0xC0) {
          code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
          utf8 += 2;
        } else if ((*utf8 & 0xF0) == 0xE0) {
          code =
              ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
          utf8 += 3;
        } else if ((*utf8 & 0xF8) == 0xF0) {
          code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) |
                 ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
          utf8 += 4;
        } else
          return false;
        output(code);
      }
      return true;
    }
    
    int main(int argc, char **argv) {
      setlocale(LC_ALL, "en_GB.UTF-8");
      if (argc == 1) {
        char utf8[] =
            "  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
            "  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς\n"
            "  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ\n"
            "  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿\n"
            "  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ\n"
            "  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν\n"
            "  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,\n"
            "  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν\n"
            "  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον\n"
            "  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι\n"
            "  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν\n"
            "  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους\n"
            "  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ\n"
            "  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ\n"
            "  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς\n"
            "  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.\n"
            "  ŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖ\n";
        if (!output_utf8(utf8, sizeof(utf8)))
          puts("Invalid unicode!");
      } else
        for (;;) {
          char *path = *(++argv);
          if (!path)
            break;
          FILE *stream = fopen(path, "rb");
          if (!stream) {
            fprintf(stderr, "Error: cannot process file '%s'\n", path);
            continue;
          }
          fseek(stream, 0, SEEK_END);
          size_t size = ftell(stream);
          rewind(stream);
          char *utf8 = malloc(size + 1);
          fread(utf8, 1, size, stream);
          utf8[size] = 0;
          fclose(stream);
          if (!output_utf8(utf8, size))
            puts("Invalid unicode!");
          free(utf8);
        }
      return 0;
    }
    There's still the issue of setlocale not always working with default settings, but that's likely just one of those quirky Linux issues. I dunno. I guess I could inspect the "LANG" environment variable as a workaround.
    Last edited by Sir Galahad; 03-09-2021 at 10:34 PM.

  6. #6
    Registered User Sir Galahad's Avatar
    Join Date
    Nov 2016
    Location
    The Round Table
    Posts
    277
    For what it's worth, I did find this https://www.unicode.org/Public/5.0.0...Properties.txt

    # ================================================

    # Derived Property: Alphabetic
    # Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic

    0041..005A ; Alphabetic # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    0061..007A ; Alphabetic # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    00AA ; Alphabetic # L& FEMININE ORDINAL INDICATOR
    00B5 ; Alphabetic # L& MICRO SIGN
    00BA ; Alphabetic # L& MASCULINE ORDINAL INDICATOR
    00C0..00D6 ; Alphabetic # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS
    00D8..00F6 ; Alphabetic # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS
    00F8..01BA ; Alphabetic # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL
    01BB ; Alphabetic # Lo LATIN LETTER TWO WITH STROKE

    (...)

    1D716..1D734 ; Alphabetic # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA
    1D736..1D74E ; Alphabetic # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA
    1D750..1D76E ; Alphabetic # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA
    1D770..1D788 ; Alphabetic # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA
    1D78A..1D7A8 ; Alphabetic # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA
    1D7AA..1D7C2 ; Alphabetic # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA
    1D7C4..1D7CB ; Alphabetic # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA
    20000..2A6D6 ; Alphabetic # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
    2F800..2FA1D ; Alphabetic # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D

    # Total code points: 93217

    Not the most tasty prospect, but if that is indeed a complete listing then I suppose I could try to parse it directly somehow. Kind of daunting. But then again, what do you expect? Every language has very specific ranges that fall into that particular category. It's not some simple matter of a few well-initialized lookup tables, unfortunately.

  7. #7
    Registered User Sir Galahad's Avatar
    Join Date
    Nov 2016
    Location
    The Round Table
    Posts
    277
    This seems to work across the board so far. Just setting the LC_ALL locale parameter to an empty string!

    Code:
    #include <locale.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <wchar.h>
    #include <wctype.h>
    
    typedef unsigned utf8_rune;
    typedef unsigned char byte;
    
    void output(utf8_rune glyph) {
      bool is_letter_like = iswalpha(glyph);
      printf("0x%x: %s\n", (unsigned)glyph, is_letter_like ? "true" : "false");
    }
    
    bool output_utf8(const byte *utf8, size_t length) {
      const byte *end = utf8 + length;
      while (utf8 < end) {
        // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
        // U+0000  to U+007F     0xxxxxxx
        // U+0080  to U+07FF     110xxxxx  10xxxxxx
        // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
        // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
        unsigned code = 0;
        if ((*utf8 & 0x80) == 0)
          code = *utf8++;
        else if ((*utf8 & 0xE0) == 0xC0) {
          code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
          utf8 += 2;
        } else if ((*utf8 & 0xF0) == 0xE0) {
          code =
              ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
          utf8 += 3;
        } else if ((*utf8 & 0xF8) == 0xF0) {
          code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) |
                 ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
          utf8 += 4;
        } else
          return false;
        output(code);
      }
      return true;
    }
    
    int main(int argc, char **argv) {
      setlocale(LC_ALL, "");
      if (argc == 1) {
        char utf8[] =
            "  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
            "  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς\n"
            "  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ\n"
            "  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿\n"
            "  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ\n"
            "  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν\n"
            "  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,\n"
            "  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν\n"
            "  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον\n"
            "  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι\n"
            "  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν\n"
            "  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους\n"
            "  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ\n"
            "  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ\n"
            "  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς\n"
            "  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.\n"
            "  ŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖ\n";
        if (!output_utf8(utf8, sizeof(utf8)))
          puts("Invalid unicode!");
      } else
        for (;;) {
          char *path = *(++argv);
          if (!path)
            break;
          FILE *stream = fopen(path, "rb");
          if (!stream) {
            fprintf(stderr, "Error: cannot process file '%s'\n", path);
            continue;
          }
          fseek(stream, 0, SEEK_END);
          size_t size = ftell(stream);
          rewind(stream);
          char *utf8 = malloc(size + 1);
          fread(utf8, 1, size, stream);
          utf8[size] = 0;
          fclose(stream);
          if (!output_utf8(utf8, size))
            puts("Invalid unicode!");
          free(utf8);
        }
      return 0;
    }

  8. #8
    Registered User
    Join Date
    Dec 2017
    Posts
    1,633
    Quote Originally Posted by Sir Galahad View Post
    This seems to work across the board so far. Just setting the LC_ALL locale parameter to an empty string!
    Excellent! Apparently "" means the "user-preferred locale".
    You could try printing the return value of setlocale(LC_ALL, "") (as %s).
    If you're in Britain it's probably going to say en_GB.UTF-8 anyway.

    Note that in the output_utf8 call, sizeof(utf8) should perhaps be one less to leave out the '\0' (although it's probably okay to process it).

    One small simplification of output_utf8 is to just use < for the if's:
    Code:
            if (*utf8 < 0x80)
            else if (*utf8 < 0xE0)
            else if (*utf8 < 0xF0)
            else if (*utf8 < 0xF8)
            else
    Last edited by john.c; 03-10-2021 at 12:05 AM.
    A little inaccuracy saves tons of explanation. - H.H. Munro

  9. #9
    Registered User Sir Galahad's Avatar
    Join Date
    Nov 2016
    Location
    The Round Table
    Posts
    277
    Quote Originally Posted by john.c View Post

    Apparently "" means the "user-preferred locale".
    You could try printing the return value of setlocale(LC_ALL, "") (as %s).
    If you're in Britain it's probably going to say en_GB.UTF-8 anyway.
    Well I'd be happy if it just worked reliably across platforms.

    Quote Originally Posted by john.c View Post

    Note that in the output_utf8 call, sizeof(utf8) should perhaps be one less to leave out the '\0' (although it's probably okay to process it).
    UTF-8 does allow arbitrary null bytes, so that won't be a problem.

    Quote Originally Posted by john.c View Post

    One small simplification of output_utf8 is to just use < for the if's:
    Code:
            if (*utf8 < 0x80)
            else if (*utf8 < 0xE0)
            else if (*utf8 < 0xF0)
            else if (*utf8 < 0xF8)
            else
    Even better. I ran some timing tests on it as well. No bottlenecks and plenty efficient.

    Cheers!

  10. #10
    and the hat of int overfl Salem's Avatar
    Join Date
    Aug 2001
    Location
    The edge of the known universe
    Posts
    39,661
    Are you in complete control of your UTF-8 data?

    There are malicious code sequences that can upset the apple cart.
    utf8 decoder vulnerability at DuckDuckGo
    If you dance barefoot on the broken glass of undefined behaviour, you've got to expect the occasional cut.
    If at first you don't succeed, try writing your phone number on the exam paper.

  11. #11
    Registered User Sir Galahad's Avatar
    Join Date
    Nov 2016
    Location
    The Round Table
    Posts
    277
    Quote Originally Posted by Salem View Post
    Are you in complete control of your UTF-8 data?

    There are malicious code sequences that can upset the apple cart.
    utf8 decoder vulnerability at DuckDuckGo
    Right. Everybody wants more flexibility, better multilingual support. But UTF-8 just isn't a very good solution. (In fact, it's 💩!) ASCII's not much better, but at least it's straightforward to process.

    I guess I just need to weigh the risks. The context I'll be using it is fairly passive, but it does have to potential to pass some UTF-8 input through to display functions and such. I'm not sure what sort of risk that poses though.

    Anyway, good to know. Thanks!

  12. #12
    Registered User
    Join Date
    May 2012
    Posts
    505
    Quote Originally Posted by Sir Galahad View Post
    Right. Everybody wants more flexibility, better multilingual support. But UTF-8 just isn't a very good solution. (In fact, it's 💩!) ASCII's not much better, but at least it's straightforward to process.

    I guess I just need to weigh the risks. The context I'll be using it is fairly passive, but it does have to potential to pass some UTF-8 input through to display functions and such. I'm not sure what sort of risk that poses though.

    Anyway, good to know. Thanks!
    UTF-8 is backwards compatible with ascii, which is a huge advantage. UTF-16 has the disadvantage that it can't represent some code points, whilst UTF-32 is rather extravagant if most of the text is English. Another issue with multi-byte encoding is that no-one has settled on an endiannness standard for files.

    So you'll find that UTF-8 is used a lot. Most functions treat strings as atomic, so the string encoding doesn't really matter. Where you do need to access individual characters, UTF-8 is a bit more awkward to use than other encodings. That's typically just before converting a character to a glyph, or for passing to or from an operating system function, or for simple text formatting.
    I'm the author of MiniBasic: How to write a script interpreter and Basic Algorithms
    Visit my website for lots of associated C programming resources.
    https://github.com/MalcolmMcLean


  13. #13
    Registered User
    Join Date
    Dec 2017
    Posts
    1,633
    An addition to the 'if/else if' chain to handle another possible error:
    Code:
            if (*utf8 < 0x80)
            else if (*utf8 < 0xC0)
                return false; // out-of-place continuation byte: 10xxxxxx
            else if (*utf8 < 0xE0)
            else if (*utf8 < 0xF0)
            else if (*utf8 < 0xF8)
            else
                return false;
    A little inaccuracy saves tons of explanation. - H.H. Munro

  14. #14
    Registered User Sir Galahad's Avatar
    Join Date
    Nov 2016
    Location
    The Round Table
    Posts
    277
    Quote Originally Posted by john.c View Post
    An addition to the 'if/else if' chain to handle another possible error:
    Code:
            if (*utf8 < 0x80)
            else if (*utf8 < 0xC0)
                return false; // out-of-place continuation byte: 10xxxxxx
            else if (*utf8 < 0xE0)
            else if (*utf8 < 0xF0)
            else if (*utf8 < 0xF8)
            else
                return false;
    That helps! Here's the current version then:

    Code:
    
    #include <locale.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <wchar.h>
    #include <wctype.h>
    
    typedef unsigned utf8_rune;
    typedef unsigned char byte;
    
    void output(utf8_rune glyph) {
      bool is_letter_like = iswalpha(glyph);
      printf("0x%x: %s\n", (unsigned)glyph, is_letter_like ? "true" : "false");
    }
    
    bool output_utf8(const byte* utf8, size_t length) {
      const byte* end = utf8 + length;
      while (utf8 < end) {
        // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
        // U+0000  to U+007F     0xxxxxxx
        // U+0080  to U+07FF     110xxxxx  10xxxxxx
        // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
        // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
        unsigned code = 0;
        if (*utf8 < 0x80)
          code = *utf8++;
        else if (*utf8 < 0xC0)
          return false;  // out-of-place continuation byte: 10xxxxxx
        else if (*utf8 < 0xE0) {
          code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
          utf8 += 2;
        } else if (*utf8 < 0xF0) {
          code =
              ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
          utf8 += 3;
        } else if (*utf8 < 0xF8) {
          code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) |
                 ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
          utf8 += 4;
        } else
          return false;
        output(code);
      }
      return true;
    }
    
    int main(int argc, char** argv) {
      setlocale(LC_ALL, "");
      if (argc == 1) {
        char utf8[] =
            "  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
            "  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς\n"
            "  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ\n"
            "  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿\n"
            "  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ\n"
            "  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν\n"
            "  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,\n"
            "  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν\n"
            "  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον\n"
            "  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι\n"
            "  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν\n"
            "  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους\n"
            "  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ\n"
            "  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ\n"
            "  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς\n"
            "  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.\n"
            "  ŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖŖ\n";
        if (!output_utf8(utf8, sizeof(utf8)))
          puts("Invalid unicode!");
      } else
        for (;;) {
          char* path = *(++argv);
          if (!path)
            break;
          FILE* stream = fopen(path, "rb");
          if (!stream) {
            fprintf(stderr, "Error: cannot process file '%s'\n", path);
            continue;
          }
          fseek(stream, 0, SEEK_END);
          size_t size = ftell(stream);
          rewind(stream);
          char* utf8 = malloc(size + 1);
          fread(utf8, 1, size, stream);
          utf8[size] = 0;
          fclose(stream);
          if (!output_utf8(utf8, size))
            puts("Invalid unicode!");
          free(utf8);
        }
      return 0;
    }
    Quote Originally Posted by Malcolm McLean View Post
    UTF-8 is backwards compatible with ascii, which is a huge advantage. UTF-16 has the disadvantage that it can't represent some code points, whilst UTF-32 is rather extravagant if most of the text is English. Another issue with multi-byte encoding is that no-one has settled on an endiannness standard for files.

    So you'll find that UTF-8 is used a lot. Most functions treat strings as atomic, so the string encoding doesn't really matter. Where you do need to access individual characters, UTF-8 is a bit more awkward to use than other encodings. That's typically just before converting a character to a glyph, or for passing to or from an operating system function, or for simple text formatting.
    It is an improvement, compared to previous standards. Maybe they just moved too quickly to standardize it before they had ironed out all of the issues? Well we're stuck with it for now anyway. No use complaining too much about it I guess.

  15. #15
    Registered User Sir Galahad's Avatar
    Join Date
    Nov 2016
    Location
    The Round Table
    Posts
    277
    Whoops, forgot the bounds checking step!

    Code:
    bool output_utf8(const byte* utf8, size_t length) {
      const byte* end = utf8 + length;
      while (utf8 < end) {
        // Code Point Range      Byte 1    Byte 2    Byte 3    Byte 4
        // U+0000  to U+007F     0xxxxxxx
        // U+0080  to U+07FF     110xxxxx  10xxxxxx
        // U+0800  to U+FFFF     1110xxxx  10xxxxxx  10xxxxxx
        // U+10000 to U+10FFFF   11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
        unsigned code = 0;
        if (*utf8 < 0x80)
          code = *utf8++;
        else if (*utf8 < 0xC0)
          return false;  // out-of-place continuation byte: 10xxxxxx
        else if (*utf8 < 0xE0) {
          if ((utf8 + 2) > end)
            return false;
          code = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F);
          utf8 += 2;
        } else if (*utf8 < 0xF0) {
          if ((utf8 + 3) > end)
            return false;
          code =
              ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F);
          utf8 += 3;
        } else if (*utf8 < 0xF8) {
          if ((utf8 + 4) > end)
            return false;
          code = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) |
                 ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F);
          utf8 += 4;
        } else
          return false;
        output(code);
      }
      return true;
    }

Popular pages Recent additions subscribe to a feed

Similar Threads

  1. Replies: 1
    Last Post: 02-23-2010, 08:15 AM
  2. How can i make a "letter could not be found" break?
    By Welshy in forum C++ Programming
    Replies: 14
    Last Post: 04-12-2005, 02:41 PM
  3. Count the number of letter "a" on a sentence
    By imbecile in C in forum C Programming
    Replies: 6
    Last Post: 07-27-2003, 02:32 PM
  4. "itoa"-"_itoa" , "inp"-"_inp", Why some functions have "
    By L.O.K. in forum Windows Programming
    Replies: 5
    Last Post: 12-08-2002, 08:25 AM
  5. "CWnd"-"HWnd","CBitmap"-"HBitmap"...., What is mean by "
    By L.O.K. in forum Windows Programming
    Replies: 2
    Last Post: 12-04-2002, 07:59 AM

Tags for this Thread