/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**--------------------------------------------------------------------
**
** change sprintf to snprintf to avoid corruption,
** added safestrcpy() macro to avoid corruption from strcpy overflow,
** and use MAXKEYLEN as string length vs. literal "34"
** SRE 11/17/99
**
*/

#include "swish.h"
#include "index.h"
#include "hash.h"
#include "mem.h"
#include "file.h"
#include "string.h"
#include "list.h"
#include "fs.h"
#include "check.h"
#include "error.h"

/* File-system specific configuration parameters.
** Each list is filled by fs_parseconfline() from a "FileRules" directive
** and starts out empty.
**/
static struct swline *pathconlist = NULL;   /* "pathname contains" patterns  */
static struct swline *dirconlist  = NULL;   /* "directory contains" patterns */
static struct swline *fileconlist = NULL;   /* "filename contains" patterns  */
static struct swline *titconlist  = NULL;   /* "title contains" patterns     */
static struct swline *fileislist  = NULL;   /* "filename is" patterns        */

/* 06/00 Jose Ruiz
** Moved to swish.h
static struct swline *suffixlist = 0;
*/
/* 06/00 Jose Ruiz
** This entry is duplicated - There is another one in swish-e
static struct swline *nocontentslist = 0; 
*/

#define MAXKEYLEN 34 /* "<dev hex>/<ino hex>" + NUL -- allows for 64 bit inodes */

/* Have we already indexed a file or directory?
** Used to avoid multiple index entries or endless looping due to
** symbolic links.
**
** Every path that can be stat()ed is remembered by its (device, inode)
** pair in a function-local hash table.  Returns 1 when the pair was
** recorded by an earlier call, 0 otherwise -- including when stat()
** fails, or when NO_SYMBOLIC_FILE_LINKS is defined and no bookkeeping is
** compiled in at all.
*/

int already_indexed(char *path)
{
#ifndef NO_SYMBOLIC_FILE_LINKS
	struct seen_inode {
		dev_t  dev;
		ino_t  ino;
		struct seen_inode *next;
	};
	/* Static, hence zero-initialised: every bucket starts as an empty chain. */
	static struct seen_inode *seen[BIGHASHSIZE];

	struct seen_inode *node;
	struct stat st;
	char key[MAXKEYLEN];
	unsigned slot;

	if ( stat( path, &st ) != 0 )
		return 0;

	/* The textual key only selects the bucket; the lookup below
	** compares the real dev/ino values. */
	snprintf( key, sizeof key, "%lx/%lx",
		(unsigned long)st.st_dev, (unsigned long)st.st_ino );
	slot = bighash(key);

	node = seen[slot];
	while ( node != NULL ) {
		if ( node->dev == st.st_dev && node->ino == st.st_ino ) {
			/* Seen before. */
			if ( verbose == 3 )
				printf( "Skipping %s:  %s\n", path, "Already indexed." );
			return 1;
		}
		node = node->next;
	}

	/* First sighting: push a new record onto the front of the chain. */
	node = (struct seen_inode *)emalloc(sizeof(struct seen_inode));
	node->dev = st.st_dev;
	node->ino = st.st_ino;
	node->next = seen[slot];
	seen[slot] = node;
#endif

	return 0;
}


/* Recursively goes into a directory and calls the word-indexing
** functions for each file that's found.
**
** dir is modified in place: one trailing '/' is removed, except that
** the root directory "/" is left untouched.
**
** Nothing is indexed when dir is empty, is a symbolic link while
** followsymlinks is off, was already visited (see already_indexed()),
** cannot be opened, or contains any entry whose name matches a
** "directory contains" rule.
**
** Entries whose name starts with '.' are skipped, as are entries
** rejected by the "filename is", "filename contains" and "pathname
** contains" rules.  Surviving files are collected with addsortentry()
** and handed to printfiles(); surviving sub-directories are handed to
** printdirs(), which calls back into this function.
*/

void indexadir(char *dir)
{
	int badfile;
	DIR *dfd;
#ifdef NEXTSTEP
	struct direct *dp;
#else
	struct dirent *dp;
#endif
	/* Scratch buffers are static so they are allocated once and reused.
	** The recursion (through printdirs) only starts after the scan loop
	** below has finished using them. */
	static int lens=0;
	static char *s=NULL;
	static int lentitle=0;
	static char *title=NULL;
	struct docentryarray *sortfilelist, *sortdirlist;
	struct swline *tmplist;
	char *slash;
	int dirlen, pathlen, namelen;

	/* An empty name would make dir[strlen(dir) - 1] read out of bounds. */
	if (dir == NULL || dir[0] == '\0')
		return;

	if(!lens)s=(char *)emalloc((lens=MAXFILELEN) + 1);
	if(!lentitle)title=(char *)emalloc((lentitle=MAXSTRLEN) + 1);
	sortfilelist = sortdirlist = NULL;

	if (islink(dir) && !followsymlinks)
		return;

	if ( already_indexed(dir) )
		return;

	/* Drop one trailing slash, but keep "/" itself so the root
	** directory can still be opened. */
	dirlen = strlen(dir);
	if (dirlen > 1 && dir[dirlen - 1] == '/')
		dir[--dirlen] = '\0';

	if ((dfd = opendir(dir)) == NULL)
		return;

	/* First pass: a single entry matching a "directory contains" rule
	** disqualifies the whole directory. */
	if (dirconlist != NULL) {
		while ((dp = readdir(dfd)) != NULL) {
			for (tmplist = dirconlist; tmplist != NULL; tmplist = tmplist->next) {
				if (matchARegex(dp->d_name, tmplist->line)) {
					closedir(dfd);	/* do not leak the handle */
					return;
				}
			}
		}
		/* Restart the same stream instead of reopening the directory;
		** a second opendir() could fail and was never checked. */
		rewinddir(dfd);
	}

	/* Second pass: classify every entry. */
	while ((dp = readdir(dfd)) != NULL) {

		if ((dp->d_name)[0] == '.')
			continue;

		/* "filename is" rules */
		badfile = 0;
		for (tmplist = fileislist; tmplist != NULL; tmplist = tmplist->next) {
			if (matchARegex(dp->d_name, tmplist->line)) {
				badfile = 1;
				break;
			}
		}
		if (badfile)
			continue;

		/* "filename contains" rules */
		badfile = 0;
		for (tmplist = fileconlist; tmplist != NULL; tmplist = tmplist->next) {
			if (matchARegex(dp->d_name, tmplist->line)) {
				badfile = 1;
				break;
			}
		}
		if (badfile)
			continue;

		/* Build "<dir>/<name>" in s, growing the buffer when needed. */
		namelen = strlen(dp->d_name);
		if ((dirlen + 1 + namelen) >= lens) {
			lens = dirlen + 1 + namelen + 200;
			s = (char *)erealloc(s, lens+1);
		}
		memcpy(s, dir, dirlen);
		pathlen = dirlen;
		if (dir[dirlen - 1] != '/')	/* only "/" still ends in a slash */
			s[pathlen++] = '/';
		memcpy(s + pathlen, dp->d_name, namelen);
		s[pathlen + namelen] = '\0';

		/* The symlink test needs the full path: the bare entry name
		** would be resolved against the current working directory
		** rather than against dir. */
		if (islink(s) && !followsymlinks)
			continue;

		/* "pathname contains" rules */
		badfile = 0;
		for (tmplist = pathconlist; tmplist != NULL; tmplist = tmplist->next) {
			if (matchARegex(s, tmplist->line)) {
				badfile = 1;
				break;
			}
		}
		if (badfile)
			continue;

		if (isdirectory(s)) {
			sortdirlist = (struct docentryarray *)
				addsortentry(sortdirlist, s, s);
			continue;
		}

		if ( already_indexed(s) )
			continue;

		if (!isoksuffix(dp->d_name, suffixlist))
			continue;

		if (ishtml(s)) {
			title=SafeStrCopy(title, (char *) parsetitle(s, s),&lentitle);
			if (!isoktitle(title))
				continue;
		}
		else {
			/* Non-HTML files are titled by their base name. */
			slash = strrchr(s, '/');
			title=SafeStrCopy(title, (slash != NULL) ? slash + 1 : s,&lentitle);
		}
		sortfilelist = (struct docentryarray *)
			addsortentry(sortfilelist, s, title);
	}

	closedir(dfd);

	printfiles(sortfilelist);
	printdirs(sortdirlist);
}

/* Calls the word-indexing function for a single file.
**
** path is modified in place: one trailing '/' is removed (a lone "/"
** is left untouched).
**
** The file is skipped when path is empty, is a symbolic link while
** followsymlinks is off, was already indexed, matches any "filename
** is", "filename contains" or "pathname contains" rule, has a suffix
** rejected by isoksuffix(), or is HTML with a title rejected by
** isoktitle().  Otherwise a docentry holding copies of the path and
** title is allocated and passed to printfile(), which frees it.
*/

void indexafile(char *path)
{
	char *t;
	static int lentitle=0;
	static char *title=NULL;
	struct docentry *fileentry;
	struct swline *rules[3];
	struct swline *tmplist;
	int i, len;

	/* An empty name would make path[strlen(path) - 1] read out of bounds. */
	if (path == NULL || path[0] == '\0')
		return;

	if(!lentitle)title=(char *)emalloc((lentitle=MAXSTRLEN)+1);

	if (islink(path) && !followsymlinks)
		return;

	if ( already_indexed(path) )
		return;

	len = strlen(path);
	if (len > 1 && path[len - 1] == '/')
		path[len - 1] = '\0';

	/* All three rule lists are exclusion lists: a match on any pattern
	** rejects the file.  This mirrors indexadir(); the "filename is"
	** test used to be negated here, which rejected every file that did
	** not match each "filename is" pattern. */
	rules[0] = fileislist;
	rules[1] = fileconlist;
	rules[2] = pathconlist;
	for (i = 0; i < 3; i++) {
		for (tmplist = rules[i]; tmplist != NULL; tmplist = tmplist->next) {
			if (matchARegex(path, tmplist->line))
				return;
		}
	}

	if (!isoksuffix(path, suffixlist))
		return;

	if (ishtml(path)) {
		title = SafeStrCopy(title, (char *) parsetitle(path, path),&lentitle);
		if (!isoktitle(title))
			return;
	}
	else {
		/* Non-HTML files are titled by their base name. */
		t = strrchr(path, '/');
		title = SafeStrCopy(title, (t != NULL) ? t + 1 : path, &lentitle);
	}

	/* printfile() takes ownership of the entry and both strings. */
	fileentry = (struct docentry *) emalloc(sizeof(struct docentry));
	fileentry->filename = (char *) estrdup(path);
	fileentry->title = (char *) estrdup(title);

	printfile(fileentry);
}

/* Indexes the words in the file described by e.
**
** When hasfilter() returns a filter program for the file name, the
** words are read from the output of "<filter> '<filename>'" run through
** popen(); otherwise the file is opened directly.  The stream is handed
** to countwords().  With verbose == 3 the base name and the resulting
** word count (or "not opened") are printed.
**
** Takes ownership of e: e->filename, e->title and e itself are freed
** before returning.  A NULL e is ignored.
*/

void printfile(struct docentry *e)
{
	int wordcount;
	char *s;
	char *src, *dst;
	FILE *fp;
	char *filterprog;
	size_t proglen, need;
	static int lenfiltercmd=0;
	static char *filtercmd=NULL;

	wordcount=-1;

	if(!lenfiltercmd)filtercmd=(char *)emalloc((lenfiltercmd=MAXSTRLEN)+1);
	filtercmd[0]=0;

	if (e == NULL)
		return;

	if (verbose == 3) {
		if ((s = (char *) strrchr(e->filename, '/')) == NULL)
			printf("  %s", e->filename);
		else
			printf("  %s", s + 1);
		fflush(stdout);
	}

	/*
	  -- simple filter call "filter filename   (decoded ascii output =stdout)"
	  -- if no filter defined, call file-open
	*/
	if ((filterprog = hasfilter (e->filename,filterlist)) != NULL) {
		/* The command goes through the shell, so the file name is
		** wrapped in single quotes and every embedded single quote
		** is written as '\'' (close quote, escaped quote, reopen).
		** Without this a file name containing a quote could run
		** arbitrary commands.  The filter program string itself is
		** inserted verbatim. */
		proglen = strlen(filterprog);
		need = proglen + 3;		/* space + opening and closing quote */
		for (src = e->filename; *src != '\0'; src++)
			need += (*src == '\'') ? 4 : 1;

		if ((int)need > lenfiltercmd) {
			lenfiltercmd = (int)need + 200;
			filtercmd = (char *)erealloc(filtercmd, lenfiltercmd + 1);
		}

		dst = filtercmd;
		memcpy(dst, filterprog, proglen);
		dst += proglen;
		*dst++ = ' ';
		*dst++ = '\'';
		for (src = e->filename; *src != '\0'; src++) {
			if (*src == '\'') {
				memcpy(dst, "'\\''", 4);
				dst += 4;
			}
			else
				*dst++ = *src;
		}
		*dst++ = '\'';
		*dst = '\0';

		fp = popen (filtercmd,"r");
#ifdef DEBUG
		printf ("DBG: FilterOpen: %s ::%p:\n",filtercmd,(void *)fp);
#endif
	} else {
		fp = fopen(e->filename, "r" );
	}

	if (fp != NULL) {
		wordcount = countwords(fp, e->filename, e->title, (isoksuffix(e->filename, nocontentslist) && (nocontentslist != NULL)));

		/* A stream from popen() must be closed with pclose(). */
		if (filterprog != NULL) pclose(fp);
		else fclose (fp);
	}

	if (verbose == 3) {
		if (wordcount > 0)
			printf(" (%d words)\n", wordcount);
		else if(wordcount == 0)
			printf(" (no words)\n");
		else printf(" (not opened)\n");
		fflush(stdout);
	}

	efree(e->filename);
	efree(e->title);
	efree(e);
}

/* Indexes the words in every file of the array, in array order.
** (The array is expected to arrive sorted alphabetically; the sorting
** itself is not done here.)
** printfile() releases each entry; the dlist vector and the array
** header are released here.  A NULL array is ignored.
*/

void printfiles(struct docentryarray *e)
{
	int idx;

	if (e == NULL)
		return;

	idx = 0;
	while (idx < e->currentsize) {
		printfile(e->dlist[idx]);
		idx++;
	}

	/* free the array and dlist */
	efree(e->dlist);
	efree(e);
}

/* Prints out the directory names as things are getting indexed.
** Calls indexadir() so directories in the array are indexed,
** in alphabetical order...
*/

void printdirs(struct docentryarray *e)
{
int i;
	if (e) {
		for(i=0;i<e->currentsize;i++) {
			if (verbose == 3)
				printf("\nIn dir \"%s\":\n", e->dlist[i]->filename);
			else if (verbose == 2)
				printf("Checking dir \"%s\"...\n",e->dlist[i]->filename);
			indexadir(e->dlist[i]->filename);
			efree(e->dlist[i]->filename);
			efree(e->dlist[i]->title);
			efree(e->dlist[i]);
		}
		efree(e->dlist);
		efree(e);
	}
}



/* This checks if a filename has one of the following suffixes:
** "htm", "HTM", "html", "HTML", "shtml", "SHTML".
**
** The suffix is whatever follows the last '.' of the final path
** component, so a dot in a directory name is never mistaken for a file
** suffix.  The comparison is exact and case-sensitive: only the six
** spellings listed above are accepted.
**
** Returns 1 for an HTML file name, 0 otherwise (including a NULL name,
** a name without a dot, and a name that ends in a dot).
*/

int ishtml(char *filename)
{
	static const char *const html_suffixes[] = {
		"htm", "HTM", "html", "HTML", "shtml", "SHTML"
	};
	const char *base;
	const char *dot;
	size_t i;

	if (filename == NULL)
		return 0;

	/* Only look at the last path component. */
	base = strrchr(filename, '/');
	base = (base != NULL) ? base + 1 : filename;

	dot = strrchr(base, '.');
	if (dot == NULL || dot[1] == '\0')
		return 0;

	for (i = 0; i < sizeof html_suffixes / sizeof html_suffixes[0]; i++) {
		if (strcmp(dot + 1, html_suffixes[i]) == 0)
			return 1;
	}
	return 0;
}

/* Check if a particular title should be ignored
** according to the settings in the configuration file.
**
** Returns 0 as soon as the title matches one of the "title contains"
** patterns, 1 when it matches none of them (or no pattern is set).
*/

int isoktitle(title)
char *title;
{
	struct swline *rule;

	for (rule = titconlist; rule != NULL; rule = rule->next) {
		if (matchARegex(title, rule->line))
			return 0;
	}
	return 1;
}

/********************************************************/
/*					"Public" functions					*/
/********************************************************/

/* Entry point of the file-system data source: indexes path with
** indexadir() when it is a directory, with indexafile() when it is a
** regular file, and does nothing otherwise.  With verbose >= 2 the
** path is announced first.
*/
void fs_indexpath(char *path)
{
	if (isdirectory(path)) {
		if (verbose >= 2)
			printf("\nChecking dir \"%s\"...\n", path);
		indexadir(path);
		return;
	}

	if (!isfile(path))
		return;

	if (verbose >= 2)
		printf("\nChecking file \"%s\"...\n", path);
	indexafile(path);
}

/* Reads the next character from the stdio stream passed as an opaque
** pointer.  Returns what fgetc() returns (the character, or EOF).
*/
int fs_vgetc(void *vp)
{
	FILE *stream = (FILE *)vp;

	return fgetc(stream);
}


/* Returns the size reported by fstat() for the file behind the stdio
** stream passed as an opaque pointer, or -1 when fstat() fails.
** The size is converted to int on return.
*/
int fs_vsize(void *vp)
{
	struct stat info;
	int fd = fileno((FILE *)vp);

	if (fstat(fd, &info) != 0)
		return -1;

	return info.st_size;
}

/* Returns the current position, as reported by ftell(), of the stdio
** stream passed as an opaque pointer.  The value is converted to int
** on return.
*/
int fs_vtell(void *vp)
{
	FILE *stream = (FILE *)vp;
	long offset = ftell(stream);

	return offset;
}

/* Moves the stdio stream passed as an opaque pointer to the absolute
** offset pos, counted from the start of the file.
** Returns what fseek() returns: 0 on success, non-zero on failure.
*/
int fs_vseek(void *vp, long pos)
{
	/* SEEK_SET replaces the former literal 0 for the whence argument. */
	return fseek((FILE *)vp, pos, SEEK_SET);
}
/* Parses one configuration line for the file-system data source.
**
** Only lines containing "FileRules" are examined.  The rule kinds are
** tried in a fixed order and the first one accepted by grabCmdOptions()
** receives the line's patterns in its list:
**   "pathname contains", "directory contains", "filename contains",
**   "title contains", "filename is".
**
** Returns 1 when the line was consumed, 0 otherwise.
**
** "IndexOnly" is deliberately not handled here any more: it also
** applies to HTTP (06/00 Jose Ruiz).
*/
int fs_parseconfline(char *line)
{
	int rv = 0;

	if (lstrstr(line, "FileRules"))
	{
		/* || short-circuits, so at most one list is updated per line. */
		if (grabCmdOptions(line, "pathname contains", &pathconlist)
		    || grabCmdOptions(line, "directory contains", &dirconlist)
		    || grabCmdOptions(line, "filename contains", &fileconlist)
		    || grabCmdOptions(line, "title contains", &titconlist)
		    || grabCmdOptions(line, "filename is", &fileislist))
		{
			rv = 1;
		}
	}

	return rv;
}

/* Descriptor that bundles the file-system functions of this file as an
** indexing data source.  The initializer is positional, so the order of
** the values must match the member order of
** struct _indexing_data_source_def, which is declared outside this file.
** NOTE(review): the per-slot meanings below are inferred from the values;
** confirm them against the struct declaration.
*/
struct _indexing_data_source_def FileSystemIndexingDataSource = {
  "File-System",     /* long name -- presumably for display */
  "fs",              /* short name -- presumably used to select this source */
  fs_indexpath,      /* indexes one path (directory or file) */
  fs_vgetc,          /* reads one character from an opened stream */
  fs_vsize,          /* reports the size of an opened stream's file */
  fs_vtell,          /* reports the current stream position */
  fs_vseek,          /* seeks to an absolute stream position */
  fs_parseconfline   /* handles "FileRules" configuration lines */
};
