/* $Id: junk_filter.cc,v 1.40 1998/06/14 06:25:36 dps Exp $ */ /* This code filters out almost all the "junk" in word documents and * is useful for extracting the text from word documents. I think this * program produces better signal to noise that catdoc. */ #include "config.h" #include #include #ifdef HAVE_STRING_H #include #endif /* HAVE_STRING_H */ #ifdef HAVE_STRINGS_H #include #endif /* HAVE_STRINGS_H */ #ifdef HAVE_CTYPE_H #include #endif /* HAVE_CTYPE_H */ #include #include #include #include "strip.h" #include "tune.h" /* Note: This is a two stage filter that outputs 0 or 1 characters per input character and thus can use the same buffer safely. */ /* Read more */ int word_junk_filter::__fill(const char *b, int n, int sp, int offset) { int end, nr; if (n>0) memmove(buf+offset, b, n); offset+=n+sp; if (offset>JUNK_FILT_BUFSIZE) { cerr<<"junk_filter::__fill: start offset "<buffer size" "(fatal)\n"; exit(1); } if (offsetread(buf+offset, JUNK_FILT_BUFSIZE-offset); nr=is->gcount(); } else nr=0; pos+=nr; end=nr+n; return end; } /* Junk filtering. Buffer size must be >=RESUME_CHARS */ int word_junk_filter::filter_junk(const char *b, int ns) { int i, d, n, top_run; if ((n=__fill(b, ns, junk_end_usage, 1))==0) return 0; if (junk_end_usage>0) memcpy(buf+ns+1, junk_end_buf, junk_end_usage); if (init==INIT_SKIPPED) { init=INIT_DONE; // Finished init ns=0; // Process all the data text_size=0; // Text size is zero } #ifdef DEBUG_FILTER_FILL printf("\n---refill ns=%d junk_end_usage=%d\n", ns, junk_end_usage); printf("---end=%d mode=%d\n", n, mode); #endif /* Correctness of this loop depends on d<=i at the top of the loop. The claims * by inductioon about the fake variable __adj__ suffice to prove this. All the relevant commments are prefixed by CS */ top_run=0; /*** CS: __adj__=1 ****/ for (i=ns+1, d=ns; i<=n; i++) // Ignore already processed data pushed back { /*** CS: i==d+junk_end_usage+__adj__ and __adj__>=0 here by induction ***/ /*** CS: if mode==SKIP_JUNK or SKIP_JUNK_WASPRN then __adj__>=1 ***/ #ifdef DEBUG_FILTER printf("mode %d, char[%i] (t=%d) %c, asc=%d\n", mode, i, n, buf[i], junk_end_usage); #endif switch(mode) { case NORMAL: /* Normal mode. OK becasue change in d at most 1 and junk_end_usage=0 */ if (buf[i]==0) { junk_end_usage=0; junk_size=0; mode=SKIP_JUNK; break; } if (buf[i] & 0x80) { /*** CS: by induction junk_end_usage==top_run ***/ junk_end_buf[junk_end_usage++]=buf[i]; // Save character top_run++; // Increment top run counter /*** CS: __adj__++ ***/ if (top_run==tune.max_top_run) { junk_size=top_run; // Top bit characters are junk junk_end_usage=0; // Throw away characters mode=SKIP_JUNK; // End junk skipping mode /*** CS: __adj__ += top_run ***/ break; } break; // Exit switch } /* Top bit not set */ if (top_run) { memcpy(buf+d, junk_end_buf, junk_end_usage); // OK by ind hyp. d+=junk_end_usage; junk_end_usage=0; // Ind. hyp now true top_run=0; // Reset top_run counter. } /* Add character */ buf[d++]=buf[i]; text_size++; if (unicode!=__UNI_NO) mode=UNICODE_Z; break; case UNICODE_Z: /* unicode zero mode. OK because change in d is 0 and junk_end_usage=0 */ if (buf[i]==0) { mode=NORMAL; // 0 => Back to printable character mode /*** CS: __adj__++ ***/ } else { #ifdef UNICODE_MARKERS printf("\n*** Mode %d: %d Got %d expected 0\n", mode, UNICODE_Z, buf[i]); #endif junk_end_usage=0; junk_size=0; mode=SKIP_JUNK; /*** CS: __adj__++ ***/ } break; case SKIP_JUNK_WASPRN: /* Skip junk was printable, added for unicode documents */ /* OK because junk_end_usage chages by at most 1 and d stays fixed */ if ((++junk_size)==tune.max_junk && text_size>tune.min_text) { #ifdef DEBUG_SINK_START cerr<<"Enter sink mode\n"; #endif /* DEBUG_SINK_START */ /*** CS: __adj__++ ***/ mode=SINK_JUNK; // Sink junk break; } if (tune.options.unicode_aggresive) { /* Agresive unicode mode: insist on the MSBs appearing */ if (unicode!=__UNI_NO) { if (buf[i]!=0) { /*** CS: __adj__+=junk_end_usage ***/ junk_end_usage=0; } mode=SKIP_JUNK; /*** CS: __adj__++ ***/ break; } } else { /* Non-aggersive unicode mode: simply skip the MSB */ if (buf[i]==0 && unicode!=__UNI_NO) { /*** CS: __adj__++ ***/ break; // Accept zeros } } /* Fall thru */ case SKIP_JUNK: /* Junk skipping mode */ if ((++junk_size)==tune.max_junk && text_size>tune.min_text) { #ifdef DEBUG_SINK_START cerr<<"Enter sink mode\n"; #endif /* DEBUG_SINK_START */ mode=SINK_JUNK; // Sink junk /*** CS: __adj__++ ***/ break; } if (isprint(buf[i]) || buf[i]=='\r' || buf[i]=='\n' || buf[i]=='\t') { #ifdef DEBUG_RESTART printf("---Accepted *%c*, usage=%d\n", buf[i], junk_end_usage); #endif junk_end_buf[junk_end_usage++]=buf[i]; // Save character if (unicode!=__UNI_NO) mode=SKIP_JUNK_WASPRN; // For unicode documents } else { /* Reset junk buffer usage counter */ /*** CS: __adj__+=junk_end_usage+1 ***/ #ifdef DEBUG_RESTART if (junk_end_usage!=0) printf("---Rejected %d, usage=%d\n", buf[i], junk_end_usage); #endif junk_end_usage=0; break; } if (junk_end_usagei occurs */ /* however forutnately this is impossible by the claims that __adj__ */ /* and indcution proves. */ /**** CS: __adj__>=1 here by indcution. Thus i>=d+junk_end_usage+1 ****/ mode=(unicode==__UNI_NO) ? NORMAL : UNICODE_Z; buf[d]=0x7f; // Mark suspect position memcpy(buf+d+1, junk_end_buf, junk_end_usage); // Nothing clobbered by [1] d+=junk_end_usage+1; // Advance d junk_end_usage=0; // Clear junk end buffer /*** CS: __adj__-- ***/ break; case SINK_JUNK: /*** CS: __adj__++ ***/ break; default: cerr<<"word_junk_filter::filter_junk: Invalid mode "<skip_to_start(); end=this->filter_junk(buf, ns); } else return EOF; // If no input stream return EOF } else { end=this->filter_junk(&save, ns); ns=0; } /* Check for EOF */ if (end==0 && is->eof()) return EOF; ns=0; /* Collapse multi-character sequeneces */ for (s=0, d=0, last=-1; s to break; default: break; } last=buf[d]=buf[s]; d++; } if (end!=1 && (buf[d-1]=='\r' || buf[d-1]=='\n')) { d--; ns=1; save=buf[d]; } } while (d==0); setg(buf, buf, buf+d); return(buf[0] & 0xff); } /* Skip to start */ int word_junk_filter::skip_to_start(void) { int zc,ffc,c,n,i,rej,limit=-1; char back[2*ST_ASC_VCHARS_UNICODE+ST_REJ_LIMIT+4]; int bp; long pos=0; enum { FIND_BEGIN, SKIP_ONE, FIND_ASC, FIND_ASC_WASASC, DONE } mode; zc=0; ffc=0; i=0; n=0; bp=0; rej=-1; mode=FIND_BEGIN; while (mode!=DONE) { /* If need a refill */ if (i==n) { /* Fill buffer */ n=this->__fill(back,bp,0,1); if (n==0) return 0; pos+=n-bp; #ifdef DEBUG_SKIP_FILL printf("---skip--- refill bp=%x, pos=%x\n", bp, pos); #endif /* Ignore first saved character */ if (mode==FIND_ASC) mode=SKIP_ONE; i=0; bp=0; } #ifdef DEBUG_SKIP printf("char %c mode %d, c=%d, pos=%x, i=%d l=%d, rej=%d\n", buf[i+1], mode, c, pos++, i, limit, rej); #endif switch(mode) { case FIND_BEGIN: switch(buf[i+1]) { case 0: zc++; ffc=0; break; case ((char) 0xff): ffc++; zc=0; break; default: if (zc>=tune.st_min_zeros || (ffc>=tune.st_min_ff && tune.options.ff_intro)) { unicode=__UNI_PROBE; back[bp++]=buf[i+1]; c=0; rej=0; mode=FIND_ASC_WASASC; // Ignore this character deliberately } else { zc=0; ffc=0; } break; } break; case SKIP_ONE: back[bp++]=buf[i+1]; c=0; rej=0; mode=FIND_ASC_WASASC; break; case FIND_ASC_WASASC: // unicode support if (buf[i+1]==0 && unicode!=__UNI_NO) { if (unicode==__UNI_PROBE) { limit=tune.unicode_st; // Start characters needed /* FIXME: Anyone got a better method of doing this? */ if ((pos+i) % 2==0) { unicode=__UNI_YES_BE; // 0 is first (even) bu#yte } else { unicode=__UNI_YES_LE; // 0 is second (odd) byte } back[bp++]=buf[i+1]; mode=FIND_ASC; // Inist next character not zero #ifdef DEBUG_SKIP_UNI cerr<<"Characters are unicode " <<((unicode==__UNI_YES_LE) ? "litle" : "big") <<" endian\n"; #endif } else back[bp++]=buf[i+1]; c++; break; } else if (unicode==__UNI_PROBE) { #ifdef DEBUG_SKIP_UNI if (unicode==__UNI_PROBE) cerr<<"Characters are not unicode\n"; #endif limit=tune.non_unicode_st; // Start characters needed unicode=__UNI_NO; // Non unicode mode=FIND_ASC; // Finding printable mode } /* fall thru */ case FIND_ASC: if (isprint(buf[i+1]) || buf[i+1]=='\r' || buf[i+1]=='\n' || buf[i+1]=='\t') // If is printable { back[bp++]=buf[i+1]; // Store character c++; // Increase count if (unicode!=__UNI_NO) mode=FIND_ASC_WASASC; // Unicode hack... } else { back[bp++]=buf[i+1]; // Store caharcetr if (buf[i+1]!=0) rej++; // Adjust reject count if (buf[i+1]==0 || rej>tune.st_rej_limit) // 0 or reject limit { bp=0; // Reset buffer pointer unicode=__UNI_PROBE; // Set unicode status to unknonw ffc=(buf[i+1]== (char) 0xff) ? 1 : 0; // reset ff count zc=(buf[i+1]==0) ? 1 : 0; // reset zeros count mode=FIND_BEGIN ; // Find leading characters mode } } if (limit==-1) // Bug catcher { cerr<<"Bug: limit still -1 at point #1\n"; exit(2); } if (c==limit) // IF found enough { #ifdef DEBUG_SKIP printf("***** Enter exit phase ****\n"); #endif mode=DONE; // Finished skipping } break; case DONE: cerr<<"skip_to_start: loop called in done mode\n"; break; default: cerr<<"skip_to_start: Mode invalid\n"; break; } i++; } #ifdef DEBUG_SKIP_MOVE printf("%c%c%c\n", buf[i-bp+1], buf[i-bp+2], buf[i-bp+3]); #endif memmove(buf, buf+i-bp+1, n-i+bp); // Move memory return(n-i+bp); // Return count } /* Option handling stuff for tuning this module. Note that only long options * are supported. If you lack long options these features are not avialable. */ /* set default tuning */ void word_junk_filter::set_dfl_tuning(void) { #ifdef AGGRESIVE_UNICODE tune.options.unicode_aggresive=AGGRESIVE_UNICODE; // Aggresive unicode mode #else tune.options.unicode_aggresive=1; // Aggresive unicode mode #endif /* AGGRESIVE_UNICODE */ tune.resume_chars=RESUME_CHARS; // Resume characters tune.st_min_zeros=ST_MIN_ZEROS; // Minimum zeros tune.st_rej_limit=ST_REJ_LIMIT; // Reject chars tolerance tune.st_min_ff=ST_MIN_FF; // Minimum ff's (alternative to 0s) tune.options.ff_intro=ALLOW_FF_LEADIN; // Enable leading ff's tune.non_unicode_st=ST_ASC_VCHARS; // ok characters leadin tune.unicode_st=ST_ASC_VCHARS_UNICODE; // ok unicode characters tune.min_text=SINK_MIN_TEXT; // text required before sink mode tune.max_junk=SINK_JUNK_CNT; // junk required for sink mode tune.max_top_run=MAX_TOP_RUN; // Max run length of top bits set } int word_junk_filter::do_reset(void) { pos=0; init=INIT_NONE; mode=NORMAL; junk_end_usage=0; ns=0; return 1; // Success }