I have to pull the text out of a Word doc. I know a small bit of C, but I'm not sure I know enough to do this. Regardless, I have a C++ DLL that another guy here at work wrote from my VB version, and it does extract the text. However, it also extracts the formatting characters that happen to be alphanumeric, so they end up mixed in with the real text. This will not work for the situation I need it for. Does anyone know how to identify the formatting characters in a Word doc while extracting the text, so that they can be ignored? Here's the code for the DLL that I'm using right now. Any ideas or suggestions would be greatly appreciated. Thanks.
Code:
#include <windows.h>
#include <stdio.h>
#include <iostream.h>
#include <string.h>
#include <fstream.h>
// Classifies one raw byte read from the document and produces the characters
// that should be appended to the extracted text.
//
//   ch   - the byte read from the file.
//   out  - receives up to two output characters.
//
// Returns the number of characters stored in `out`:
//   * letters and digits are copied through unchanged          (1)
//   * a carriage return (13) becomes the pair CR LF            (2)
//   * space, tab and a fixed set of punctuation become a space (1)
//   * every other byte is dropped                              (0)
static int FilterDocumentByte(char ch, char out[2])
{
    if ((ch >= 'A' && ch <= 'Z') ||
        (ch >= 'a' && ch <= 'z') ||
        (ch >= '0' && ch <= '9')) {
        out[0] = ch;
        return 1;
    }

    if (ch == 13) {
        out[0] = 13;
        out[1] = 10;
        return 2;
    }

    switch (ch) {
    case ' ': case '\t':
    case '.': case '?': case '!': case ';': case ':':
    case '(': case ')': case '{': case '}': case '[': case ']':
    case '`': case 39: // 39 is the apostrophe
        out[0] = ' ';
        return 1;
    default:
        return 0;
    }
}

// Reads the file at `pPath` as raw bytes, keeps only the characters accepted
// by FilterDocumentByte, and returns the result as a wide string.
//
// This is a plain byte filter: it does not interpret the document's structure,
// so any byte in the file that happens to be a letter or digit is kept.
//
//   pPath - path of the file to read; NULL is treated like an unopenable file.
//
// Returns a string allocated with SysAllocStringLen (NULL if that allocation
// fails). When the file cannot be opened an error line is written to cout and
// an empty string is returned. The result holds at most 199999 characters;
// input beyond that is ignored. The returned string should be released with
// SysFreeString once the caller is done with it.
extern "C" LPWSTR __stdcall ConvertDocument(const char* pPath)
{
    enum { kBufferSize = 200000 };  // filtered characters plus room to spare

    CHAR oStr[kBufferSize];
    long length = 0;
    bool opened = false;

    if (pPath != NULL) {
        ifstream tfile(pPath, ios::binary | ios::nocreate);
        if (tfile) {
            opened = true;

            char ch;
            // Test the result of get() itself so the final byte is not
            // processed a second time once the stream hits end-of-file.
            while (tfile.get(ch)) {
                char piece[2];
                const int pieceLen = FilterDocumentByte(ch, piece);

                // Stop before a write could run past the buffer; a CR needs
                // two slots, so the check must use the actual piece length.
                if (length + pieceLen > kBufferSize - 1) {
                    break;
                }
                for (int j = 0; j < pieceLen; ++j) {
                    oStr[length++] = piece[j];
                }
            }
            // The stream is closed by its destructor at the end of this scope.
        }
    }

    if (!opened) {
        cout << "ERROR: Cannot open file." << endl;
    }

    // Allocate a real wide string and widen each character individually.
    // Casting the narrow buffer to LPWSTR would fuse byte pairs into bogus
    // UTF-16 code units. Everything the filter emits is 7-bit ASCII, so a
    // value-preserving widening is exact.
    LPWSTR bsText = SysAllocStringLen(NULL, static_cast<unsigned int>(length));
    if (bsText != NULL) {
        for (long k = 0; k < length; ++k) {
            bsText[k] = static_cast<wchar_t>(oStr[k]);
        }
        bsText[length] = L'\0';  // allocation always has room for the terminator
    }
    return bsText;
}