#!/usr/local/bin/perl
# search_index.cgi - create index file
#
# ICE Version 1.5 beta 3
# May 1998
# (C) Christian Neuss (ice@isa.informatik.th-darmstadt.de)
# Modified by Chris Samaritoni
# Jan 1999
# To index your site simply execute this script as a CGI script.
# For example http://www.yourname.com/cgi-bin/search_index.cgi
# Re-index whenever changes are made to your documents.
# Main script: emits a progress page, scans the configured directories for
# documents, builds the word index in memory and writes it to $INDEXFILE.
$| = 1;    # unbuffered output, so progress is sent while indexing is running

# CGI header and page prologue.  The progress report printed below is
# newline-oriented plain text, so it is wrapped in <pre> to keep one message
# per line when the browser renders the page as HTML.
print "Content-type: text/html\n\n";
print "<html>\n<head><title>Indexing Site</title></head>\n";
print "<body>\n";
print "<h1>Indexing site...</h1>\n<pre>\n";

#--- start of configuration --- put your changes here ---
# NOTE: $ENV{'DOCUMENT_ROOT'} contains the full path to your web documents.

# The physical directory/directories to scan for html-files.
# Example:
# @SEARCHDIRS=("$ENV{'DOCUMENT_ROOT'}/docs","$ENV{'DOCUMENT_ROOT'}/articles");
@SEARCHDIRS=("$ENV{'DOCUMENT_ROOT'}");

# The physical directory/directories NOT to scan for html-files. Maybe you
# have a directory inside the SEARCHDIRS that you don't want to be searched,
# such as your cgi-bin, postcards or wwwboard.  Entries must be written
# without a trailing slash; they are compared literally against each file's
# parent directories.
# Example:
# @NOTSEARCHDIRS=("$ENV{'DOCUMENT_ROOT'}/postcard","$ENV{'DOCUMENT_ROOT'}/wwwboard");
@NOTSEARCHDIRS=("$ENV{'DOCUMENT_ROOT'}/cgi-bin", "$ENV{'DOCUMENT_ROOT'}/postcard");

# Location of the index file. This file contains the list of indexed words.
# Example:
# $INDEXFILE="$ENV{'DOCUMENT_ROOT'}/cgi-bin/index.idx";
$INDEXFILE="$ENV{'DOCUMENT_ROOT'}/cgi-bin/index.idx";

# The ICE indexer will support full international characters by
# converting them to a canonical form if $ISO is set to "y". For
# servers that contain English text only, you can improve indexing
# speed by setting $ISO to "n".
$ISO="n";

# Type of system (for figuring out the path delimiting character)
# that this script runs on. Select one of "UNIX", "MAC", or "PC".
# Important: If you use NT, depending on the Perl binary, the
# correct setting can be either PC or UNIX!
$TYPE="UNIX";

# Minimum length of word to be indexed
$MINLEN=3;

# Stop indexing a word that appears in over X percent of all files
$MAXPERCENT=60;

# File suffixes to index (regular expression)
$SUFFIXES='\.(rtf|[sp]?html?|txt)$';
#--- end of configuration --- don't change anything below ---

# File::Find is the core module that replaces the old find.pl library.
use File::Find ();

local(@allfiles,@tempfiles,%freqlist,$file,%temp,$tempfile);

unless (open(INDEX, '>', $INDEXFILE)) {
    &error("Cannot open $INDEXFILE: $!\n");
    exit;    # without an index file there is nothing useful left to do
}

# Collect candidate files.  &wanted reads the full path from the package
# variable $name, so expose File::Find's path under that name for each call.
File::Find::find(
    sub { local $name = $File::Find::name; &wanted; },
    @SEARCHDIRS
);

# Drop every file that lives in, or anywhere below, an excluded directory:
# strip one trailing path component at a time and look each parent up.
$count=0;
@tempfiles = @allfiles;
@allfiles = ();
%temp = map { $_, 1 } @NOTSEARCHDIRS;
OUT: foreach $file (@tempfiles) {
    $tempfile = $file;
    while ($tempfile =~ s#/[^/]+$##) {
        next OUT if ($temp{$tempfile});
    }
    push @allfiles, $file;
}

foreach $name (@allfiles) {
    print "indexing [$name]\n";
    # report the percentage only when it has advanced
    $lastpercent=$percent;
    $percent=int(100*$count/@allfiles);
    if ($percent>$lastpercent) { print $percent,"% "; }
    &indexfile($name);
    $count++;
    # every 100th file, from the 200th up to the 1100th ...
    if ((($count % 100) == 0) && ($count >= 200) && ($count < 1200)) {
        # ... remove the most frequent words so far from the index
        &removefrequent;
    }
}
&removefrequent;

# print sorted list of words and their fileids
foreach $w (sort keys(%index)) {
    print INDEX "$w ",$index{$w},"\n";
}
print INDEX "--\n";

# print list of all files and their fileid; a directory line is written
# whenever the directory differs from that of the previous entry
local($dir,$prevdir,$name);
foreach $w (sort keys(%files)) {
    if ($files{$w} =~ m:(.*)/([^/]*)$:) {
        $prevdir = $dir;
        $name = $2;
        $dir = $1;
        if ($prevdir ne $dir) {
            print INDEX "$dir\n";
        }
        $title = $titles{$w};
        $mtime = $mtimes{$w};
        print INDEX "$w $name /$mtime $title\n";
    }
}

# Buffered write errors (e.g. a full disk) only surface when closing.
close(INDEX) || &error("Cannot write $INDEXFILE: $!\n");

print "Done!\n</pre>\n</body>\n</html>\n";
### system("ps -vx | egrep 'perl|MEM'");
# wanted - per-file callback for the directory scan.
# Appends the path held in the package variable $name to @allfiles when it
# matches the configured $SUFFIXES pattern (case-insensitively).
sub wanted {
    return unless $name =~ /$SUFFIXES/i;
    push(@allfiles, $name);
}
# removefrequent - retire words that occur in too many files.
# Reads %files, %index and $MAXPERCENT; sets the global $numfiles.
# Every fileid stored in an index entry ends with one blank, so the number of
# blanks in an entry is the number of files containing the word.  Entries
# above the threshold are replaced by the marker "0".
sub removefrequent{
    $numfiles = keys(%files);
    foreach my $word (keys(%index)) {
        my $num = ($index{$word} =~ tr/ //);
        # don't index words in more than X % of the files
        next unless $num*100 > $MAXPERCENT*$numfiles;
        print "removing common word: $word [$num of $numfiles]\n";
        $index{$word}="0";
    }
}
# indexfile($file) - tokenize one document and merge its words into the index.
#
# Directories and unreadable files are skipped.  Otherwise the file gets the
# next id and the package globals below are updated:
#   $fileno, $fileid   running counter and its id (upper-case hex + one blank)
#   %files             fileid => path, with UNIX style slashes
#   %titles, %mtimes   fileid => document title / modification time
#   %index             word => concatenated "<count>[:]<fileid>" entries
#   %freqlist          scratch word => count table, cleared before returning
# Configuration read: $MINLEN, $ISO, $TYPE.
#
# NOTE(review): the patterns for &nbsp;, <title>, </title>, tags and META
# content were reconstructed from a copy in which markup inside the code had
# been stripped -- confirm them against the upstream ICE indexer.
sub indexfile{
  my($file)=@_;
  my($title,$intitle,$freq);

  # PJ - no directories
  return if -d $file;

  # three-argument open: the name is never interpreted as a mode or a pipe
  my $in;
  unless (-r $file && open($in, '<', $file)){ # file readable?
    print "cannot read file [$file]\n"; ### XXX no printo
    return;
  }

  $fileno++;
  $fileid = sprintf ("%X ",$fileno);

  # Record the path in the form the file list is written in: the list writer
  # splits entries on "/", so MAC and PC separators are converted first.
  my $path = $file;
  $path =~ tr/\n/ /s;
  if($TYPE eq "MAC"){ $path =~ s|:|/|g; }
  if($TYPE eq "PC") { $path =~ s|\\|/|g; }
  # on a MAC, add the leading slash
  if ($path =~ m/^[^\/]/) { $path = "/$path"; }
  $files{$fileid}=$path;

  my $mtime = (stat($file))[9];

  # strip html tags?
  my $ishtml = ($file =~ m!\.([sp]?html?)$!i) ? 1 : 0;

  # Read up to each tag close character ">".  The separator is localized so
  # that it is restored for the rest of the program when this sub returns.
  local $/ = ">";

  while (defined(my $chunk = <$in>)){
    $chunk =~ s/&nbsp;/ /ig;
    $chunk =~ s/\s+/ /g;              # fold whitespaces into a single blank
    $chunk =~ s/([^\n])</$1\n</g;     # insert newline before every '<'
    $chunk =~ s/>([^\n])/>\n$1/g;     # .. and after every '>'

    # each piece is now either one complete tag or a run of text
    foreach (split(/\n/,$chunk)){
      # opening title tag
      if(m:<title\b[^>]*>:i){
        $intitle="y";
        $title="";
      }
      # closing title tag
      if(m:</title>:i){
        $intitle="";
      }

      # strip spurious tag delimiters
      s![<>]! !go if (!($ishtml));

      # outside a tag or inside META tag => index word
      my $metacontent;
      if(/<meta\b[^>]*\bcontent\s*=\s*"([^"]*)"/i){
        $metacontent = $1;
      }
      next if /</ && !defined($metacontent);   # any other tag: nothing to index
      $_ = $metacontent if defined($metacontent);

      if( $ISO eq "y" && /[&\xc0-\xff]/){
        # convert html special chars and iso 8bit to text
        $_ = &html2text($_);
      }

      # if inside title
      if ($intitle){
        tr/a-zA-Z\xc0-\xff0-9\-/ /cs;
        $title.="$_";
      } else {
        # the following line defines what you consider a "printable" char
        tr/a-zA-Z\xc0-\xff/ /cs;
        foreach (split(/ /,$_)){
          next unless (length($_)>=$MINLEN);  # if too short skip
          # Only reachable if ";" is added to the printable set above.
          if (/\;$/) {
            # get rid of trailing ";" that aren't part of &Xuml;
            s/((\w|\&[a-z,A-Z]+\;)+)\;?/$1/;
          }
          if(/^[A-Z][^A-Z]*$/){  # "Someword" to "someword"
            tr/A-Z/a-z/;
          }
          $freqlist{$_}++;
          if(/[A-Z]/) {  # store abbr. as all-lower, too
            tr/A-Z/a-z/;
            $freqlist{$_}++;
          }
        }
      }
    }
  }
  close($in);

  $title =~ tr/\n/ /s;

  # Append "<count><fileid>" to each word's entry; counts of more than one
  # digit are followed by ":" so they can be told apart from the hex fileid.
  # Words marked "0" have been retired as too common and stay that way.
  foreach my $w (sort keys(%freqlist)){
    if($index{$w} ne "0"){
      $freq = $freqlist{$w};
      $freq .= ":" unless length($freq)==1;
      $index{$w} .= $freq.$fileid;
    }
  }
  $titles{$fileid}=$title;
  $mtimes{$fileid}=$mtime;
  undef %freqlist;
}
# iso2html - translate iso 8 bit characters to HTML
#
# Thanks to
# Pierre Cormier (cormier.pierre@uqam.ca)
# Universite du Quebec Montreal

# initTables - build the conversion tables shared by the iso/html helpers.
# Fills three package globals:
#   @isohtml   character code => HTML text (an entity for 0xC0..0xFF)
#   @iso2text  character code => plain ASCII replacement
#   %html2iso  HTML text      => single 8 bit character
sub initTables {
    # Entity names for the codes 0xC0..0xFF, in code order.  They are kept
    # without the "&" and ";" delimiters and assembled below.
    my @entity_names = qw(
        Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
        Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
        ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml   times
        Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig
        agrave aacute acirc  atilde auml   aring  aelig  ccedil
        egrave eacute ecirc  euml   igrave iacute icirc  iuml
        eth    ntilde ograve oacute ocirc  otilde ouml   DIVIS
        oslash ugrave uacute ucirc  uuml   yacute thorn  yuml
    );

    # codes below 0xC0 stand for themselves
    foreach (0..191) { $isohtml[$_] = pack("C",$_); }
    foreach (0..$#entity_names) {
        $isohtml[hex('c0') + $_] = "&" . $entity_names[$_] . ";";
    }

    # preset iso2text variable settings: by default an accented letter maps
    # to the first letter of its entity name, i.e. the character after "&"
    foreach (0..191) { $iso2text[$_] = pack("C",$_); }
    foreach (hex('c0')..hex('ff')) {
        $iso2text[$_] = substr($isohtml[$_],1,1);
    }

    # now assign exceptions:
    $iso2text[hex('c4')] = 'Ae';
    $iso2text[hex('c6')] = 'AE';
    $iso2text[hex('d0')] = 'ETH'; # ???
    $iso2text[hex('d6')] = 'Oe';
    $iso2text[hex('d7')] = 'x';
    $iso2text[hex('dc')] = 'Ue';
    $iso2text[hex('de')] = 'Th'; # thorn ???
    $iso2text[hex('df')] = 'sz';
    $iso2text[hex('e4')] = 'ae';
    $iso2text[hex('e6')] = 'ae';
    $iso2text[hex('f7')] = 'D'; # Divis?
    $iso2text[hex('fc')] = 'ue';
    $iso2text[hex('fe')] = 'th'; # thorn

    # set html2iso variable
    foreach (1..255) {
        $html2iso{$isohtml[$_]} = pack("C",$_);
    }
    # "&DIVIS;" above is this file's own name for 0xF7; also accept the
    # standard HTML entity for the division sign.
    $html2iso{'&divide;'} = pack("C", hex('f7'));
}
# iso2html($input) - replace every character of $input by its @isohtml entry.
# Builds the conversion tables on first use.  Returns the converted string.
sub iso2html {
    my ($input) = @_;
    &initTables unless defined($isohtml[0]);
    my $output;
    $output .= $isohtml[ord($_)] foreach split(//, $input);
    return $output;
}
# iso2text($input) - replace every character of $input by its @iso2text entry.
# Builds the conversion tables on first use.  Returns the converted string.
sub iso2text {
    my ($input) = @_;
    &initTables unless defined($isohtml[0]);
    my $output;
    $output .= $iso2text[ord($_)] foreach split(//, $input);
    return $output;
}
# html2iso($input) - replace HTML entities in $input by 8 bit characters.
# The input is cut at every ";".  In a piece containing "&", the text after
# the last "&" is taken as an entity name and looked up in %html2iso; the
# text before it is copied.  Pieces without "&" are copied as they are.
# Builds the conversion tables on first use.  Returns the converted string.
sub html2iso {
    my ($input) = @_;
    &initTables unless defined($isohtml[0]);
    my $output;
    foreach my $piece (split(/;/, $input)) {
        if ($piece =~ /(.*)&(.*)/) {
            $output .= $1 . $html2iso{"&$2;"};
        }
        else {
            $output .= $piece;
        }
    }
    return $output;
}
# html2text(@args) - convert HTML text to plain text in two steps: entities
# to 8 bit characters (html2iso), then 8 bit characters to text (iso2text).
sub html2text {
    my $eightbit = &html2iso(@_);
    return &iso2text($eightbit);
}
sub error {
print $_[0];
print "