1 /*****************************************************************/
2 /* BIFF-stream (excel file) parsing */
4 /* This file is part of catdoc project */
5 /* (c) David Rysdam 1998 */
6 /* (c) Victor Wagner 1998-2003, (c) Alex Ott 2003 */
7 /*****************************************************************/
20 #include "../compat/strftime.h"
22 static unsigned char rec[MAX_MS_RECSIZE];
24 short int *formatTable=NULL;
25 char *forced_date_format = NULL;
26 size_t formatTableIndex = 0;
27 size_t formatTableSize = 0;
28 double date_shift = 25569.0;
29 #define FLT_FORMAT(a,b,c) a #b c
30 #define MK_FORMAT(x) FLT_FORMAT("%.",x,"g")
31 char number_format[8]=MK_FORMAT(DBL_DIG);
33 void CleanUpFormatIdxUsed(void);
/*
 * Reads a BIFF stream from input.  The first record header is checked
 * against the known BOF identifiers to detect the BIFF version; later
 * records are read header-first (type, length, payload) and handed to
 * process_item().  filename is used only in diagnostics.
 * The enclosing loop statements are not visible in this listing.
 */
35 void do_table(FILE *input,char *filename) {
37 long reclen,build_year=0,build_rel=0,offset=0;
40 date_shift=25569.0; /* Windows 1900 date system */
41 CleanUpFormatIdxUsed();
/* A record header is two 16-bit words: record type, then payload length. */
43 catdoc_read(rec,2,1,input);
44 biff_version=getshort(rec,0);
45 catdoc_read(rec,2,1,input);
46 reclen=getshort(rec,0);
47 if ( biff_version == 0x0809 || biff_version == 0x0409 ||
48 biff_version == 0x0209 || biff_version == 0x0009 ) {
/* Only BOF payloads of 8 or 16 bytes are accepted. */
49 if (reclen==8 || reclen==16) {
50 if (biff_version == 0x0809 ) {
51 itemsread=catdoc_read(rec,4,1,input);
/* The first payload word goes to build_rel, the second to build_year. */
54 build_year=getshort(rec+2,0);
55 build_rel=getshort(rec,0);
58 catdoc_read(rec,8,1,input);
66 } else if (biff_version == 0x0209 ) {
69 } else if (biff_version == 0x0409 ) {
/* Read the remainder of the BOF payload; presumably offset counts the
 * bytes already consumed above -- its assignments are not visible here. */
75 itemsread=catdoc_read(rec,reclen-offset,1,input);
78 fprintf(stderr,"%s: Invalid BOF record\n",filename);
82 itemsread=catdoc_read(rec,126,1,input);
85 if (catdoc_eof(input)) {
86 fprintf(stderr,"%s: No BOF record found\n",filename);
90 unsigned char buffer[2];
92 itemsread = catdoc_read(buffer, 2, 1, input);
/* End of input: give process_item() a final MSEOF record with no payload. */
93 if (catdoc_eof(input)) {
94 process_item(MSEOF,0,NULL);
101 rectype=getshort(buffer,0);
102 itemsread = catdoc_read(buffer, 2, 1, input);
105 reclen=getshort(buffer,0);
/* The payload is read only when it is non-empty and fits into rec;
 * the trailing reclen >0 test adds nothing once reclen is non-zero and
 * getshort() is non-negative -- TODO confirm getshort()'s range. */
106 if (reclen && reclen <MAX_MS_RECSIZE &&reclen >0){
107 itemsread = catdoc_read(rec, 1, reclen, input);
111 if (rectype != BOF) {
115 /* fprintf(stderr,"Rectype 0x%04X reclen=%d\n",rectype, reclen); */
116 process_item(rectype,reclen,rec);
117 if (rectype == MSEOF) {
unsigned char **sst=NULL;	/* Shared string table, parsed into an array of strings by parse_sst() */
int sstsize = 0;	/* Number of strings in SST */
unsigned char *sstBuffer=NULL;	/* Unparsed SST, accumulated from the SST record and its CONTINUE records */
int sstBytes = 0;	/* Size of SST data already accumulated in the buffer */
int codepage=1251;	/* default */

/* Holds a pointer to the cell of a string formula, because the value
 * itself arrives in a later STRING record, which stores it through
 * this pointer.  NULL when no such formula is pending. */
unsigned char **saved_reference = NULL;
/*
 * Handles one BIFF record: rectype is the record type, reclen the
 * payload length in bytes, rec the payload.
 * Cell records store a heap-allocated string in the slot returned by
 * allocate(row,col).  SST and CONTINUE payloads are gathered in
 * sstBuffer and parsed as soon as a record other than CONTINUE follows.
 * Most case labels are not visible in this listing.
 */
137 void process_item (int rectype, int reclen, unsigned char *rec) {
138 if (rectype != CONTINUE && prev_rectype == SST) {
139 /* we have accumulated unparsed SST, and now encountered
140 * another record, which indicates that SST is ended */
141 /* fprintf(stderr,"parse sst!\n");*/
142 parse_sst(sstBuffer,sstBytes);
146 fprintf(stderr,"File is encrypted\n");
151 /* File is write protected, but we only read it */
/* A source charset that is already set is kept; the codepage stored in
 * the file is then ignored. */
154 if (source_charset) break;
155 codepage=getshort(rec,0);
156 /*fprintf(stderr,"CODEPAGE %d\n",codepage); */
/* NOTE(review): 1200 is skipped, presumably because it denotes UTF-16
 * and needs no charset table -- confirm. */
157 if (codepage!=1200) {
158 const char *cp = charset_from_codepage(codepage);
159 source_charset=read_charset(cp);
/* An explicitly stored format overrides the built-in date format that
 * shares its index. */
165 format_code=getshort(rec,0);
166 SetFormatIdxUsed(format_code);
167 /* this debug code prints format string */
171 fprintf(stderr,"Format %x \"",format_code);
172 if (rec[2] == reclen - 3 && rec[3] != 0) {
173 for (i=0,ptr=rec+3;i<rec[2];i++,ptr++) {
177 for (i=0,ptr=rec+5;i<rec[2];i++,ptr+=2) {
181 fprintf (stderr,"\"\n");
186 /* Just copy SST into buffer, and wait until we get
187 * all CONTINUE records */
189 /* fprintf(stderr,"SST\n"); */
190 /* If exists first SST entry, then just drop it and start new*/
191 if (sstBuffer != NULL)
196 sstBuffer=(unsigned char*)malloc(reclen);
198 if (sstBuffer == NULL ) {
199 perror("SSTptr alloc error! ");
202 memcpy(sstBuffer,rec,reclen);
/* A CONTINUE record is of interest only while an SST is being gathered. */
206 if (prev_rectype != SST) {
207 return; /* to avoid changing of prev_rectype;*/
/* NOTE(review): the realloc result overwrites sstBuffer directly, so the
 * old buffer is lost if the call fails. */
209 sstBuffer=realloc(sstBuffer,sstBytes+reclen);
210 if (sstBuffer == NULL ) {
211 perror("SSTptr realloc error! ");
214 memcpy(sstBuffer+sstBytes,rec,reclen);
220 unsigned char **pcell;
/* The string starts 6 bytes into the record, after row, column and one
 * more 16-bit field. */
221 unsigned char *src=(unsigned char *)rec+6;
223 saved_reference=NULL;
224 row = getshort(rec,0);
225 col = getshort(rec,2);
226 /* fprintf(stderr,"LABEL!\n"); */
227 pcell=allocate(row,col);
228 *pcell=copy_unicode_string(&src);
231 case BLANK: { int row,col;unsigned char **pcell;
232 row = getshort(rec,0);
233 col = getshort(rec,2);
234 pcell=allocate(row,col);
239 int row, startcol,endcol;
240 unsigned char **pcell;
241 row = getshort(rec,0);
242 startcol = getshort(rec,2);
/* The last column index is read from the final two bytes of the record. */
243 endcol=getshort(rec,reclen-2);
244 pcell=allocate(row,endcol);
249 case CONSTANT_STRING: {
250 int row = getshort(rec,0);
251 int col = getshort(rec,2);
252 unsigned char **pcell;
/* Index of the cell text within the shared string table. */
253 int string_no=getshort(rec,6);
255 fprintf(stderr,"CONSTANT_STRING before SST parsed\n");
258 /* fprintf(stderr,"col=%d row=%d no=%d\n",col,row,string_no); */
260 saved_reference=NULL;
261 pcell=allocate(row,col);
262 if (string_no>=sstsize|| string_no < 0 ) {
263 fprintf(stderr,"string index out of boundary\n");
265 } else if (sst[string_no] !=NULL) {
267 unsigned char *outptr;
/* The cell gets its own copy of the table entry. */
268 len=strlen((char *)sst[string_no]);
269 outptr=*pcell=malloc(len+1);
270 strcpy((char *)outptr,(char *)sst[string_no]);
282 unsigned char **pcell;
284 saved_reference=NULL;
285 row = getshort(rec,0)-startrow;
286 col = getshort(rec,2);
287 pcell=allocate(row,col);
/* 8-byte double at offset 6, format index at offset 4. */
288 *pcell=(unsigned char *)strdup(format_double(rec,6,getshort(rec,4)));
293 unsigned char **pcell;
295 row = getshort(rec,0)-startrow;
296 col = getshort(rec,2);
297 pcell=allocate(row,col);
/* NOTE(review): the value is read at offset 7 while the sibling cases read
 * theirs at offset 6 -- confirm against the record layout. */
298 *pcell=(unsigned char *)strdup(format_int(getshort(rec,7),getshort(rec,4)));
303 int row,col,format_code;
304 unsigned char **pcell;
306 saved_reference=NULL;
307 row = getshort(rec,0)-startrow;
308 col = getshort(rec,2);
309 pcell=allocate(row,col);
310 format_code = getshort(rec,4);
311 *pcell=(unsigned char *)strdup(format_rk(rec+6,format_code));
315 int row,col,startcol,endcol,offset,format_code;
316 unsigned char **pcell;
317 row = getshort(rec,0)-startrow;
318 startcol = getshort(rec,2);
319 endcol = getshort(rec,reclen-2);
320 saved_reference=NULL;
/* Each cell of the run takes 6 bytes: a 16-bit format index followed by
 * the RK value passed to format_rk(). */
322 for (offset=4,col=startcol;col<=endcol;offset+=6,col++) {
323 pcell=allocate(row,col);
324 format_code=getshort(rec,offset);
325 *pcell=(unsigned char *)strdup(format_rk(rec+offset+2,format_code));
332 unsigned char **pcell;
333 saved_reference=NULL;
334 row = getshort(rec,0)-startrow;
335 col = getshort(rec,2);
336 pcell=allocate(row,col);
/* 0xFF in both bytes 12 and 13 marks a result that is not a number;
 * rec[6] then selects what kind of result it is. */
337 if (((unsigned char)rec[12]==0xFF)&&(unsigned char)rec[13]==0xFF) {
338 /* not a floating point value */
343 *pcell=(unsigned char *)strdup(buf);
344 } else if (rec[6]==2) {
347 *pcell=(unsigned char *)strdup(buf);
348 } else if (rec[6]==0) {
/* The text follows in a STRING record; remember where to store it. */
349 saved_reference=pcell;
352 int format_code=getshort(rec,4);
353 *pcell=(unsigned char *)strdup(format_double(rec,6,format_code));
358 unsigned char *src=(unsigned char *)rec;
359 if (!saved_reference) {
360 fprintf(stderr,"String record without preceeding string formula\n");
363 *saved_reference=copy_unicode_string(&src);
368 fprintf(stderr,"BOF when current sheet is not flushed\n");
374 case 0x43: /*from perl module Spreadsheet::ParseExcel */
376 short int formatIndex = getshort(rec,2);
377 /* we are interested only in format index here */
/* Grow the table in steps of 16 entries. */
378 if (formatTableIndex >= formatTableSize) {
379 formatTable=realloc(formatTable,
380 (formatTableSize+=16)*sizeof(short int));
383 fprintf(stderr,"Out of memory for format table");
387 formatTable[formatTableIndex++] = formatIndex;
390 case MS1904: /* Macintosh 1904 date system */
402 /* fprintf(stderr,"Row! %d %d %d\n",getshort(rec,0), getshort(rec+2,0),getshort(rec+4,0)); */
406 /* fprintf(stderr,"INDEX! %d %d\n", getlong(rec+4,0), getlong(rec+8,0)); */
411 fprintf(stderr,"Unknown record 0x%x\n length %d\n",rectype,reclen);
/* Remembered so that the next call can tell when an SST run has ended. */
415 prev_rectype=rectype;
419 /* Extracts string from sst and returns mallocked copy of it */
/*
 * Decodes the BIFF string that starts at *src and returns it in a
 * heap-allocated buffer, each character converted with convert_char().
 * The 16-bit word at *src is the character count; a flags byte selects
 * 1- or 2-byte characters and tells which extra fields surround the text.
 */
421 unsigned char *copy_unicode_string (unsigned char **src) {
425 int to_skip=0; /* Used to count data after end of string */
426 int offset = 1; /* Variable length of the first field */
428 /* char *realstart=*src; */
429 unsigned char *dest;/* where to copy string */
430 unsigned char *s,*d,*c;
434 /* for(i=0;i<20;i++) */
435 /* fprintf(stderr,"%02x ",(*src)[i]); */
436 /* fprintf(stderr,"\n"); */
/* The flags byte is looked for at *src+1+offset first and, when that byte
 * is not one of the known flag combinations, at *src+offset. */
438 flags = *(*src+1+offset);
439 if (! ( flags == 0 || flags == 1 || flags == 8 || flags == 9 ||
440 flags == 4 || flags == 5 || flags == 0x0c || flags == 0x0d ) ) {
442 flags = *(*src+offset);
444 if (! ( flags == 0 || flags == 1 || flags == 8 || flags == 9 ||
445 flags == 4 || flags == 5 || flags == 0x0c || flags == 0x0d ) ) {
446 /* fprintf(stderr,"Strange flags = %d, returning NULL\n", flags); */
451 count=getshort(*src,0);
/* Bit 0 of flags selects 2-byte characters. */
453 charsize=(flags &0x01) ? 2 : 1;
/* Bits 2 and 3 tell which extra header fields are present; to_skip is the
 * amount of data that trails the text, start_offset the header length. */
455 switch (flags & 12 ) {
456 case 0x0c: /* Far East with RichText formatting */
457 to_skip=4*getshort(*src,2+offset)+getlong(*src, 4+offset);
458 start_offset=2+offset+2+4;
459 /* fprintf(stderr,"Far East with RichText formating\n"); */
462 case 0x08: /* With RichText formatting */
463 to_skip=4*getshort(*src,2+offset);
464 start_offset=2+offset+2;
465 /* fprintf(stderr,"With RichText formating %d\n",getshort(*src,2+offset)); */
468 case 0x04: /* Far East */
469 to_skip=getlong(*src, 2+offset);
470 start_offset=2+offset+4;
471 /* fprintf(stderr,"Far East\n"); */
476 start_offset=2+offset;
477 /* fprintf(stderr,"Default string\n"); */
480 /* fprintf(stderr,"count=%d skip=%d start_offset=%d\n", */
481 /* count, to_skip, start_offset); */
482 /* and here we copy the string */
483 if ( (dest=malloc(count+1)) == NULL ) {
484 perror("Dest string alloc error");
/* Moves the caller's pointer past header, text and trailing data. */
485 *src+=(to_skip+start_offset+(count*charsize));
491 for (s=*src,d=dest,i=0;i<count;i++,s+=charsize) {
492 /* fprintf(stderr,"l=%d len=%d count=%d charsize=%d\n",l,len,count,charsize); */
/* A 0 or 1 byte inside the text is apparently taken as a fresh flags byte
 * that can switch the character size in mid-string. */
493 if ( (charsize == 1 && (*s == 1 || *s == 0)) ||
494 (charsize == 2 && (*s == 1 || *s == 0) && *(s+1) != 4)) {
495 /* fprintf(stderr,"extchar (unicode)=%02x %02x\n",*s, *(s+1)); */
496 charsize=(*s &0x01) ? 2 : 1;
502 if ( charsize == 2 ){
503 u=(unsigned short)getshort(s,0);
504 c=(unsigned char *)convert_char(u);
505 /* fprintf(stderr,"char=%02x %02x\n", *s, *(s+1)); */
/* Single-byte text is mapped through the source charset, which is loaded
 * on first use. */
507 if (!source_charset) {
508 check_charset(&source_csname,source_csname);
509 /* fprintf(stderr,"charset=%s\n",source_csname);*/
510 source_charset=read_charset(source_csname);
512 u=(unsigned short)to_unicode(source_charset,(unsigned char)*s);
513 c=(unsigned char *)convert_char(u);
/* convert_char() yields a C string; dl is its length. */
516 int dl = strlen((char *)c);
/* NOTE(review): the realloc result is assigned straight back to dest. */
519 dest=realloc(dest,len+1);
522 strcpy((char *)d,(char *)c);
532 /* Format code is index into format table (which is list of XF records).
534 * Second word of XF record is format type index.
535 * Format index between 0x0E and 0x16, also between 0x2D and 0x2F, denotes
536 * date if it is not used for explicitly stored formats.
537 * BuiltInDateFormatIdx converts format index into index of explicit
538 * built-in date formats suitable for strftime. */
/* Maps a format index to a 1-based slot of the built-in date format table:
 * 0x0E..0x16 give 1..9 and 0x2D..0x2F give 10..12.  The tail of the
 * function is not visible in this listing. */
540 int BuiltInDateFormatIdx (int index) {
542 offset=1; /* offset of date formats */
543 /* 0 is used as false -- format not found */
544 if ((index>= 0x0E) && (index<=0x16)) {
545 return offset+index-0x0E;
547 if ((index>=0x2d) && (index<=0x2F)) {
/* The 0x0E..0x16 range fills the first nine slots, hence the +9. */
548 return offset+index-0x2d+9;
/*
 * GetBuiltInDateFormat stores and returns
 * built in xls2csv strftime formats.
 *
 * dateindex is a 1-based table slot; 0 is reserved as the "format not
 * found" flag.  Returns the strftime format for slots
 * 1..NUMOFDATEFORMATS and NULL for anything else.
 */
#define NUMOFDATEFORMATS 12
char *GetBuiltInDateFormat(int dateindex) {
	static char *formats[]={
		/* reserved */ NULL, /* BuiltInDateFormatIdx use dateindex=0 as flag format not found */
		/* 0x0E */ "%m-%d-%y",		/* 01 */
		/* 0x0F */ "%d-%b-%y",		/* 02 */
		/* 0x10 */ "%d-%b",		/* 03 */
		/* 0x11 */ "%b-%d",		/* 04 */
		/* 0x12 */ "%l:%M %p",		/* 05 */
		/* 0x13 */ "%l:%M:%S %p",	/* 06 */
		/* 0x14 */ "%H:%M",		/* 07 */
		/* 0x15 */ "%H:%M:%S",		/* 08 */
		/* 0x16 */ "%m-%d-%y %H:%M",	/* 09 */
		/* 0x2d */ "%M:%S",		/* 10 */
		/* 0x2e */ "%H:%M:%S",		/* 11 */
		/* 0x2f */ "%M:%S",		/* 12 */
		/* slot 13 lies beyond NUMOFDATEFORMATS, so the range check
		 * below never hands it out */
		/* 0xa4 */ "%m.%d.%Y %l:%M:%S %p"	/* 13 */
	};

	if (dateindex>0 && dateindex <= NUMOFDATEFORMATS) {
		return formats[dateindex];
	}
	return NULL;
}
585 static char FormatIdxUsed[NUMOFDATEFORMATS];
587 void CleanUpFormatIdxUsed() {
589 for (i=0;i<NUMOFDATEFORMATS; i++)
594 * format index between 0x0E and 0x16 also between 0x2D and ox2F denotes
595 * date in case when they are built-in Excel97 formats.
596 * Nevertheless, those indexes can be used for explicitly stored formats,
597 * which are not dates in general.
598 * SetFormatIdxUsed marks this formats as already used
599 * and excludes them from list of built-in formats
600 * preventing misformatting of corresponding data.
602 void SetFormatIdxUsed(int format_code) {
604 /*fprintf(stderr,"Format idx %x to be set to dirty\n",format_code);
606 dateindex=BuiltInDateFormatIdx(format_code);
608 FormatIdxUsed[dateindex]=1;
609 /*fprintf(stderr,"Date idx %d is set to be dirty\n",dateindex); */
614 * format index between 0x0E and 0x16 also between 0x2D and ox2F denotes
615 * date in case when they are built-in Excel97 formats.
616 * Nevertheless, those indexes can be used for explicitly stored formats,
617 * which are not dates in general.
618 * SetFormatIdxUsed marks this formats as already used
619 * and excludes them from list of built-in formats
620 * preventing misformatting of corresponding data.
621 * IsFormatIdxUsed tests this case.
623 char IsFormatIdxUsed(int format_code) {
625 dateindex=BuiltInDateFormatIdx(format_code);
627 /* fprintf(stderr,"Date idx %d is dirty\n",dateindex); */
628 return FormatIdxUsed[dateindex]==1;
634 /* Checks if format denoted by given code is date
635 * Format code is index into format table (which is list of XF records
637 * Second word of XF record is format type inex
638 * format index between 0x0E and 0x16 also between 0x2D and ox2F denotes
640 * If so, it returns strftime format for this date. Otherwise returns
645 char *isDateFormat(int format_code) {
648 if (format_code>=formatTableIndex) {
649 fprintf(stderr,"Format code %d is used before definition\n",format_code);
653 index = formatTable[format_code];
654 if (IsFormatIdxUsed(index)) {
655 fprintf(stderr,"Format %x is redefined\n",index);
656 /* this format is something user-defined --- not a standard built-in date*/
659 dateindex=BuiltInDateFormatIdx(index);
661 if (forced_date_format) return forced_date_format;
662 return GetBuiltInDateFormat(dateindex);
669 time_t float2date(double d);
671 * Extracts floating point value and formats it
674 char *number2string(double d,short int format_code) {
675 static char buffer [128];
677 if ((datefmt=isDateFormat(format_code))!=NULL) {
678 time_t t = float2date(d);
679 strftime(buffer, 127,datefmt, gmtime(&t));
681 sprintf(buffer,number_format,d);
/*
 * Reads the 8-byte double stored least-significant byte first at
 * rec+offset and formats it with number2string().
 * The result lives in number2string()'s static buffer.
 */
char *format_double(unsigned char *rec,int offset,int format_code) {
	union { unsigned char cc[8];
		double d;} dconv;
	unsigned char *s,*d;
	int i;
# ifdef WORDS_BIGENDIAN
	/* big-endian host: copy the eight bytes in reverse order */
	for(s=rec+offset+8,d=dconv.cc,i=0;
			i<8;i++) *(d++)=*(--s);
# else
	/* byte order already matches the host: straight copy */
	for(s=rec+offset,d=dconv.cc,i=0;
			i<8;i++) *(d++)=*(s++);
# endif
	return number2string(dconv.d,format_code);
}
/*
 * Formats integer value into static buffer.
 * format_code is currently not used.  The returned buffer is
 * overwritten by the next call.
 */
char *format_int(int value,int format_code) {
	static char buffer[12];

	(void)format_code;
	/* bounded write: 12 bytes hold any 32-bit int, and a wider int is
	 * truncated instead of overrunning the buffer */
	snprintf(buffer,sizeof buffer,"%i",value);
	return buffer;
}
/*
 * Formats the 4-byte RK value at rec through number2string().
 * Two encodings are visible in this listing: an integer obtained by
 * shifting the 32-bit word right by two bits, and a double whose bytes
 * 4..7 (3..0 on a big-endian host) are the RK word with its two lowest
 * bits cleared.  The test that chooses between them and the
 * initialisation of the remaining bytes are not visible here.
 */
712 char* format_rk(unsigned char *rec,short int format_code) {
/* Integer encoding: the two lowest bits of the word are dropped. */
718 value=(double)(getlong(rec,0)>>2);
721 union { unsigned char cc[8];
726 # ifdef WORDS_BIGENDIAN
727 for(s=rec+4,d=dconv.cc,i=0; i<4;i++)
729 dconv.cc[3]=dconv.cc[3] & 0xfc;
/* The four RK bytes become the upper half of the double. */
731 for(s=rec,d=dconv.cc+4,i=0;
732 i<4;i++) *(d++)=*(s++);
/* Clear the two lowest bits of the first byte copied from rec. */
733 dconv.cc[4]=dconv.cc[4] & 0xfc;
739 return number2string(value,format_code);
744 * Converts excel date into time_t
746 time_t float2date(double f) {
747 /* Hacked version. Excell stores date as floating point count of days
748 * since 1.1.1900. or 1.1.1904
749 * We are substracting value of 1.1.1970 and multiplying
750 * by 86400 thus getting seconds from the epoch
752 return rint((f-date_shift)*86400);
756 /* Parses SST into array of strings */
758 void parse_sst(unsigned char *sstbuf,int bufsize) {
759 int i; /* index into sst */
760 unsigned char *curString; /* pointer into unparsed buffer*/
761 unsigned char *barrier=(unsigned char *)sstbuf+bufsize; /*pointer to end of buffer*/
762 unsigned char **parsedString;/*pointer into parsed array*/
764 sstsize = getlong(sstbuf+4,0);
765 sst=(unsigned char **)malloc(sstsize*sizeof(unsigned char *));
768 perror("SST allocation error");
771 memset(sst,0,sstsize*sizeof(char *));
772 for (i=0,parsedString=sst,curString=sstbuf+8;
773 i<sstsize && curString<barrier; i++,parsedString++) {
774 /* fprintf(stderr,"copying %d string\n",i); */
775 *parsedString = copy_unicode_string(&curString);
777 /* fprintf(stderr,"end sst i=%d sstsize=%d\n",i,sstsize); */