buffer[++(*bufptr)]=c;
if (*bufptr >= PARAGRAPH_BUFFER-2) {
buffer[++(*bufptr)]=0;
-/*****************************************************************/
-/* Reading routines for MS-Word, MS-Write and text files */
-/* */
-/* This file is part of catdoc project */
-/* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003 */
-/*****************************************************************/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <string.h>
-#include <stdio.h>
-#include "catdoc.h"
-unsigned short int buffer[PARAGRAPH_BUFFER];
-static unsigned char read_buf[256];
-static int buf_is_unicode;
-
-/**************************************************************************/
-/* Just prints out content of input file. Called when file is not OLE */
-/* stream */
-/* Parameters - f - file to copy out. header - first few bytes of file, */
-/* which have been already read by format recognition code, but should */
-/* be output anyway */
-/**************************************************************************/
-void copy_out (FILE *f,char *header) {
- char *buf=(char *)buffer;
- int count,i;
- long offset;
- if (get_unicode_char == get_word8_char) {
- /* non-word file and -u specified. Trying to guess which kind of
- * unicode is used
- */
- if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) {
- get_unicode_char = get_utf16msb;
- fputs(convert_char(header[2]<<8|header[3]),stdout);
- fputs(convert_char(header[4]<<8|header[5]),stdout);
- fputs(convert_char(header[6]<<8|header[7]),stdout);
- } else if ((unsigned char)header[0]!=0xFF ||
- (unsigned char)header[1]!=0xFE) {
- int c,j,d;
- /* if it is not utf16, assume it is UTF8. We are told -u,
- * aren't we */
- get_unicode_char = get_utf8;
- i=0;
- while (i<8) {
- c=(unsigned char)header[i++];
- if (c >=0x80) {
- if ( c<0xE0) {
- c=(c & 0x1F);
- count =1;
- } else {
- c=(c & 0xF);
- count = 2;
- }
- for (j=0;j<count;j++) {
- if (i<7) {
- d=(unsigned char) header[i++];
- } else {
- d=fgetc(f);
- }
- c=c<<6 | (d & 0x3F);
- }
- }
- fputs (convert_char(c),stdout);
- }
- } else {
- get_unicode_char = get_utf16lsb;
- fputs(convert_char(header[3]<<8|header[2]),stdout);
- fputs(convert_char(header[5]<<8|header[4]),stdout);
- fputs(convert_char(header[7]<<8|header[6]),stdout);
- }
- while (!catdoc_eof(f)) {
- i=get_unicode_char(f,&offset,0x7FFFFFFF);
- if (i!=EOF) fputs(convert_char(i),stdout);
- }
- } else {
- for (i=0;i<8;i++) {
- fputs(convert_char(to_unicode(source_charset,(unsigned char)header[i])),stdout);
- }
- /* Assuming 8-bit input text */
- while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) {
- for (i=0;i<count;i++) {
- fputs(convert_char(to_unicode(source_charset,
- (unsigned char)buf[i])),stdout);
- }
- }
- }
-}
-/**************************************************************************/
-/* process_file - main process engine. Reads word file using function, */
-/* pointed by get_unicode_char, searches for things which looks like */
-/* paragraphs and print them out */
-/**************************************************************************/
-int process_file(FILE *f,long stop) {
- int bufptr;
- int tabmode=0;
- long offset=0;
- int hyperlink_mode = 0;
- unsigned short c;
- /* Now we are starting to read with get_unicode_char */
- while (!catdoc_eof(f) && offset<stop) {
- bufptr = -1;
- do {
- c=get_unicode_char(f,&offset,stop);
- /* Following symbols below 32 are allowed inside paragraph:
- 0x0002 - footnote mark
- 0x0007 - table separator (converted to tabmode)
- 0x0009 - Horizontal tab ( printed as is)
- 0x000B - hard return
- 0x000C - page break
- 0x000D - return - marks an end of paragraph
- 0x001E - IS2 for some reason means short defis in Word.
- 0x001F - soft hyphen in Word
- 0x0013 - start embedded hyperlink
- 0x0014 - separate hyperlink URL from text
- 0x0015 - end embedded hyperlink
- */
- if (tabmode) {
- tabmode=0;
- if (c==0x007) {
- buffer[++bufptr]=0x1E;
- continue;
- } else {
- buffer[++bufptr]=0x1C;
- }
- }
- if (c<32) {
- switch (c) {
- case 0x007:
- tabmode = 1;
- break;
- case 0x000D:
- case 0x000B:
- buffer[++bufptr]=0x000A;
- break;
- case 0x000C:
- buffer[++bufptr]=c;
- break;
- case 0x001E:
- buffer[++bufptr]='-';
- break;
- case 0x0002: break;
-
- case 0x001F:
- buffer[++bufptr]=0xAD;/* translate to Unicode
- soft hyphen */
- break;
- case 0x0009:
- buffer[++bufptr]=c;
- break;
- case 0x0013:
- hyperlink_mode=1;
- buffer[++bufptr]=' ';
- break;
- case 0x0014:
- hyperlink_mode = 0;
- /*fall through */
- case 0x0015:
- /* just treat hyperlink separators as
- * space */
- buffer[++bufptr]=' ';
- break;
- case 0x0001: if (hyperlink_mode)
- break;
- /* else fall through */
- default:
- bufptr=-1; /* Any other control char - discard para*/
- }
- } else if (c != 0xfeff) {
- /* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything
- * else*/
- buffer[++bufptr]=c;
- }
- } while (bufptr<=PARAGRAPH_BUFFER-2 &&
- !catdoc_eof(f) &&
- buffer[bufptr]!=0x000a);
- if (bufptr>0) {
- buffer[++bufptr]=0;
- output_paragraph(buffer);
- }
- }
- return 0;
-}
-/**********************************************************************/
-/* Reads file from MS-Word 97 and above file. Takes in account strange*
- * situation that unicode and non-unicode 256-byte blocks could be *
- * intermixed in word file *
- * *
- * Parameters: *
- * *
- * f - file to read *
- * offset - position of the character inside file (to determine *
- * possible block boundaries *
- **********************************************************************/
-int get_word8_char(FILE *f,long *offset,long fileend) {
- int count,i,u;
- char c;
- if ((i=(*offset)%256) ==0) {
- count=catdoc_read(read_buf,1,256,f);
- memset(read_buf+count,0,256-count);
- buf_is_unicode=0;
- if (*offset+(long)count>fileend) {
- count=fileend-*offset;
- }
- while (i<count) {
- c=read_buf[i++];
- if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {
- buf_is_unicode=1;
- break;
- }
- i++;
- }
- i=0;
- }
- if (buf_is_unicode) {
- u=read_buf[i] | read_buf[i+1]<<8;
- (*offset)+=2;
- } else {
- u=to_unicode(source_charset,read_buf[i]);
- (*offset)++;
- }
- return u;
-}
-
-
output_paragraph(buffer);
*bufptr=-1;
}