Questions to Structure - Beginner needs help!

**Fresa** · 03-01-2012

Oh sh***... no, then the words with 25 signs and an umlaut will be filtert... so do I have to start again form step one with fgets/fgetc or something else???

**Adak** · 03-01-2012

Originally Posted by Fresa

Oh sh***... no, then the words with 25 signs and an umlaut will be filtert... so do I have to start again form step one with fgets/fgetc or something else???

Don't panic.

To change the letters to lowercase, if they're uppercase letters, use tolower(), as I mentioned in #6 of my last post.

You can use the same kind of logic to handle umlauts. First part, is identifying the letters that have an umlaut. What is their character (ascii) number, how many of them are there, and what do you want to change them into, when they are found?

And do you have any other special characters that need to be handled, that are not part of English?

You have the right idea, but use it for the umlauts, and use tolower() for the lowercase conversion.

If you don't have an ascii character table yet, you should download one from ascii.table.com or any other site that has them (several do). You can also create one of your own, using your computer. Keep the ascii table handy to help with the logic details to handle the umlauts.

**Fresa** · 03-01-2012

Don't panic.

Haha!

I try my best! :P

Okay, I tried to follow your steps, but I must have a mistake or misunderstanding. At the end of my post I show you my code.
Well... the ascii-table shouldn't be a problem. Yes, there is an "ß" that can be written as an "ss", but the ascii-method I think is much better...

Code:

#include <stdio.h>
#include <string.h>
#include <ctype.h>

struct alist {
    char word[26];
    int count;
};
 
int main(void) 
{
    FILE *fp;
    int i, j, k=2, len, total, totalduplex, wasduplex, entry;
    char buffer[100];
    struct alist list[600];
    
    if((fp=fopen("Demodatei.txt","r+")) == NULL) 
    {
        printf("Error! File did not open - closing program.\n");
        return 1;
    }
    else
    {
        printf("How should the words been list? \n1: alphabetically \n2: by frequency\n");
        scanf("%d",&entry);

        while(k==2)
        {
            switch(entry)
            {
                case 1: k=1; printf("\ntest: alphabetically\n"); break;
                case 2: k=1; printf("\ntest: by frequency\n"); break;
                default: printf("\nWrong input! Please retry.\n"); fflush(stdin); k=2; scanf("%d",&entry);
            }
        }


        printf("\n Word Analysis:\n\n                          Word    Frequency\n");
        printf("  ==========================================\n");
        i = total = wasduplex = totalduplex = 0;

        while((fscanf(fp, "%s", buffer)) > 0 ) 
            {
                len = strlen(buffer);

                if(len>25)
                {
                while(!isalpha(list[i].word[len]))
                    {
                        list[i].word[len--]='\0';
                    }
                }

                strcpy(list[i].word, buffer);

                for(j=0;j<i;j++)
                {
                list[i].word[j] = tolower(list[i].word[j]);
                }

                for(j=0;j<i;j++)
                    {
                        if(!strcmp(list[j].word, list[i].word))
                            {
                                list[j].count++;
                                i--;
                                wasduplex=1;
                                totalduplex++;
                            }
                    }
                
                if(!wasduplex) {list[i].count++;}
                wasduplex=0;
                i++;
                total++;
            }

    fclose(fp);

    for(j=0;j<i;j++) {printf("%4d %26s %4d\n",j+1,list[j].word, list[j].count);}

    j = (total - totalduplex);
    printf("\nTotal Words: %d  Duplicate Words: %d  Unique Words: %d\n",total,totalduplex,j);

    }

    fflush(stdin);
    getchar();
    return 0;
}

Now it pus the special signs out again...

--------------------------

ASCII:
ä: 132
ü: 129
ö: 148
ß: 225

(first tolower, then I haven't to rewrite Ö, Ü, Ä)

**Adak** · 03-01-2012

What are you stuck on?

**Fresa** · 03-01-2012

It doesn't work right. Is the for-loop not correct?

Can you please control the whole steps you told me (abaout tolower)?

**Adak** · 03-01-2012

Our programs are diverging a bit, but I had this:

Code:

#include <stdio.h>
#include <string.h>
#include <ctype.h>

struct alist {
   char word[26];
   int count;
};

int main(void) {
   FILE * fp;
   int i,j,len,totalWords, totalDupes, unique, wasDup;
   struct alist list[600];
   struct alist temp;
   char buffer[100];
   
   if((fp=fopen("Edelweiss.txt","r")) == NULL) {
         printf("Error! File did not open - closing program.\n");
         return 1;
   }
   printf("\n Word Analysis:\n\n                            Word    Frequency\n");
   printf("  ============================================\n");
   i = totalWords = wasDup = totalDupes = 0;
   while((fscanf(fp, "%s", buffer)) > 0) {
      len = strlen(buffer);
      if(len >= 25) 
         continue;
      strcpy(list[i].word, buffer);
      
      for(j=0;j<len;j++) {
         list[i].word[j] = tolower(list[i].word[j]);
      }
      j=0;
      while(!isalpha(list[i].word[len])) {
         list[i].word[len--]= '\0';
         ++j;
      }
      for(j=0;j<i;j++) {
         if(!strcmp(list[j].word, list[i].word)) { //words match
            list[j].count++;                        //add to the tally
            --i;
            wasDup=1;                               //it was a duplicate word
            ++totalDupes;
         }
      }
      if(!wasDup)                                  //it was not a duplicate word
         list[i].count++;
      wasDup=0;

      ++i; 
      ++totalWords;
   }
   fclose(fp);
    
   unique = (totalWords - totalDupes); 
   
   printf("\n Unique Words, in Order of their Appearance:\n\n");
   for(i=0;i<unique;i++) {
      printf("%4d. %26s %4d\n",i+1,list[i].word,list[i].count);
   }
   printf("\n\n Press enter when ready, to see the next page of output\n");
   getchar();
   
   
   printf("\n Sorted Alphabetically:\n\n");
   for(i=0;i<unique-1;i++) {
      for(j=i+1;j<unique;j++) {
         if(strcmp(list[i].word, list[j].word) > 0) {
            temp = list[i];
            list[i] = list[j];
            list[j] = temp;
         }
      }
   }
    
   for(i=0;i<unique;i++)
      printf("%4d. %26s %4d\n",i+1,list[i].word, list[i].count);

   printf("\nTotal Words: %d  Duplicate Words: %d  Unique Words: %d\n",totalWords,totalDupes,unique);
   printf("\n\n Press enter when ready, to see the next page of output\n");
   getchar();
      
   printf("\n Sorted by Word Frequency:\n\n");
   for(i=0;i<unique-1;i++) {
      for(j=i+1;j<unique;j++) {
         if(list[i].count < list[j].count) {
            temp = list[i];
            list[i] = list[j];
            list[j] = temp;
         }
      }
   }
   for(i=0;i<unique;i++)
      printf("%4d. %26s %4d\n",i+1,list[i].word, list[i].count);

   printf("\n");
   return 0;
}

But it does nothing with umlauts. Same general logic can be used though:

right after you get the strlen() of the buffer
either use strchr() to find the umlaut (if present), or use your own for loop to scan through the buffer and see if there is an umlaut char in there.

If you find one, then replace it with the appropriate letter(s).

You're sure that the umlaut MUST be replaced? It's not good when you start replacing letters, so try not to do that.

**Fresa** · 03-01-2012

Wow! You're a genius

I tested your program and I also found some little deficits.
- the umlauts, of course... but this will follow...
- when I test my Demodatei.txt ist counts just 588 words (instead 590 or 591)
- And please create this as es text-file and test it!!!:

test test test two,words why ... why is a backspace in the list and frequency one ps:f-the-word-why test test test

Now I want to rethink your algorithm and then I write again!

THANK YOU!!!

**kevinstrijbos** · 03-01-2012

Yes, your arguments are valid. Reaching an element takes longer indeed, but it takes less resources.
For example: If you have a text file with 500 words and you declare an array with 50 000 structs, there are 49 500 structs too much declared.

**Adak** · 03-01-2012

Originally Posted by Fresa

Wow! You're a genius

I tested your program and I also found some little deficits.
- the umlauts, of course... but this will follow...
- when I test my Demodatei.txt ist counts just 588 words (instead 590 or 591)
- And please create this as es text-file and test it!!!:

Now I want to rethink your algorithm and then I write again!

THANK YOU!!!

Show me the input where it fails - but make it less than 50 words in length. I don't want to spend the time counting words up to more than 50, for now.

The German adaption is something you need to nail down the logic to, by identifying the umlauts, and what you want to do with them. Handling that problem in the buffer string, right after the strlen() line of code, seems best.

Right now, I can't tell you that there even IS a problem with umlauts.

@Kevin:
A 50,000 word array was an example of an array that should be allocated off the heap portion of memory, rather than allocated off the stack portion of memory.

No one is suggesting an allocation of a 50,000 word array, when the input will be only 600 words.

**Fresa** · 03-02-2012

Show me the input where it fails

I wrote, that you could test this line (copy it in a text-file):

test test test two,words why ... why is a backspace in the list and frequency one ps:-of-the-word-why test test test

Or do you mean the output?

And then you will understand my problem

Before I ask you, I will try to resolve the porblem with the umlauts at the weekend on my own.
Anyway I can't watch this thread periodical at this weekend. So we could make a work break on it. Just maybe I will write and then just maybe you will answer, okay?

**Adak** · 03-02-2012

Yes, it's time to sort out the three dot word, and words-connected-with-hyphens.

Would "words-connected-with-hyphens", be one word or four words?

Have a good weekend. Monday is OK.

**Fresa** · 03-05-2012

I hope you had a nice weekend!

"words-connected-with-hyphens"

This should be 4 words.
Now, there are given the special signs, where the strings should be seperated. In my opinion the isalpha-method is much better, but it should also work with

Code:

(list[i].word[len]==' ') || (list[i].word[len]=='.') || (list[i].word[len]=='!') || (list[i].word[len]=='?') || (list[i].word[len]==',') || (list[i].word[len]==';') || (list[i].word[len]=='-') || (list[i].word[len]=='_') || (list[i].word[len]==':') || (list[i].word[len]=='\0') || (list[i].word[len]=='\n') || (list[i].word[len]=='\x0A') || (list[i].word[len]=='\x0D')

But it also doesn't work with the "-" via this method and I don't understand why.
At the weekend I had no time, but now I want to busy oneself with the umlauts.

**Fresa** · 03-05-2012

At first I tried to create a strcuture for the forbidden words, but it doesn't work. Why?:

Code:

#include <stdio.h>
#include <string.h>
#include <ctype.h>
 
struct alist {
    char word[26];
    int count;
};

struct forbiddenlist {
    char forbiddenword[50];
};
  
int main(void)
{
    FILE *fp;
    int i, j, k=2, len, total, totalduplex, wasduplex, entry, forb=0;
    char buffer[100];
    struct alist list[600];
    struct forbiddenlist listf[100];
     
    if((fp=fopen("Edelweiss.txt","r+")) == NULL)
    {
        printf("Error! File did not open - closing program.\n");
        return 1;
    }
    else
    {
        printf("How should the words been list? \n1: alphabetically \n2: by frequency\n");
        scanf("%d",&entry);
 
        while(k==2)
        {
            switch(entry)
            {
                case 1: k=1; printf("\ntest: alphabetically\n"); break;
                case 2: k=1; printf("\ntest: by frequency\n"); break;
                default: printf("\nWrong input! Please retry.\n"); fflush(stdin); k=2; scanf("%d",&entry);
            }
        }
 
 
        printf("\nWord Analysis:\n\n                          Word    Frequency\n");
        printf("  ==========================================\n");
        i = total = wasduplex = totalduplex = 0;
 
        while((fscanf(fp, "%s", buffer)) > 0 )
            {
                len = strlen(buffer);
 
                if(len<=25)
                {
                    strcpy(list[i].word, buffer);

                    for(j=0;j<i;j++)
                    {
                        list[i].word[j] = tolower(list[i].word[j]);
                    }

                    while((list[i].word[len]==' ') || (list[i].word[len]=='.') || (list[i].word[len]=='!') || (list[i].word[len]=='?') || (list[i].word[len]==',') || (list[i].word[len]==';') || (list[i].word[len]=='-') || (list[i].word[len]=='_') || (list[i].word[len]==':') || (list[i].word[len]=='\0') || (list[i].word[len]=='\n') || (list[i].word[len]=='\x0A') || (list[i].word[len]=='\x0D'))
                    {
                        list[i].word[len--]='\0';
                    }
                 
                    for(j=0;j<i;j++)
                    {
                        if(!strcmp(list[j].word, list[i].word))
                        {
                            list[j].count++;
                            i--;
                            wasduplex=1;
                            totalduplex++;
                        }
                    }
                 
                    if(!wasduplex) {list[i].count++;}
                    wasduplex=0;
                    i++;
                    total++;
                }
/*test*/        else
                {
                    forb++;

                    strcpy(listf[i].forbiddenword, buffer);

                    for(j=0;j<i;j++)
                    {
                        listf[i].forbiddenword[j] = tolower(listf[i].forbiddenword[j]);
                    }

                    while((listf[i].forbiddenword[len]==' ') || (listf[i].forbiddenword[len]=='.') || (listf[i].forbiddenword[len]=='!') || (listf[i].forbiddenword[len]=='?') || (listf[i].forbiddenword[len]==',') || (listf[i].forbiddenword[len]==';') || (listf[i].forbiddenword[len]=='-') || (listf[i].forbiddenword[len]=='_') || (listf[i].forbiddenword[len]==':') || (listf[i].forbiddenword[len]=='\0') || (listf[i].forbiddenword[len]=='\n') || (listf[i].forbiddenword[len]=='\x0A') || (listf[i].forbiddenword[len]=='\x0D'))
                    {
                        listf[i].forbiddenword[len--]='\0';
                    }                
                }
/*end of test*/
            }
 
    fclose(fp);
 
    for(j=0;j<i;j++) {printf("%4d %26s %4d\n",j+1,list[j].word, list[j].count);}
 
    j = (total - totalduplex);
    printf("\nTotal Words: %d  Duplicate Words: %d  Unique Words: %d\n",total,totalduplex,j);
    printf("forbidden: %d", forb);

    for(j=0;j<i;j++) {printf("%4d %26s",j+1,listf[j].forbiddenword);}

    }
 
    fflush(stdin);
    getchar();
    return 0;
}

And why isn't the first word transformed tolower?

**Fresa** · 03-05-2012

I can resolve the last problem with this line:

Code:

list[0].word[0] = tolower(list[0].word[0]);

or by changing

Code:

for(j=0;j<i;j++)

into

Code:

for(j=0;j<=i;j++)

----
I also found the cause, why the frequency is just shown in the release-modus correct. I had to set the memory 0.

Code:

for(i=0;i<=600;i++) {list[i].count = 0;}

**Adak** · 03-05-2012

I suggest not having the forbidden words in there. Shouldn't be needed.

This ensures that the ... is included as a word:

Code:

      while(!isalpha(list[i].word[len]) && list[i].word[len-2] != '.') {
         list[i].word[len--]= '\0';
         ++j;
      }

The red part is the only new part.

I'd avoid adding whole new big things to your program. The overall backbone of it should be considered good until shown otherwise. Look for little improvements in the logic, to handle the umlauts, etc.