#!/bin/bash
# Created by Ben Okopnik on Sat Jan  3 00:50:54 EST 2009
# Some ideas from Karl Vogel's Hyperestraier article
# (http://linuxgazette.net/158/vogel.html)

# Maximum file size in MB; adjust this to your preferences 
MAX_FILE_SIZE=3

dir="$(pwd)"
db="$dir/search${dir##*/}"

# Default options for "gather":
#	-cl: Regions of overwritten documents will be cleaned up
#	-ft: Files will be treated as plain text
#	-bc: Binary files will be detected and ignored
#	-sd: Modification date of each file will be recorded as an attribute
#	-cm: Documents whose modification date has not changed will be ignored
#	-lf N: Ignore any documents larger than N megabytes
gather_opts="-cl -ft -bc -sd -cm -lf $MAX_FILE_SIZE"

# Define file extensions to ignore; this saves us time, since we don't need
# to run "file" over them. This list does not include "questionable"
# filetypes (i.e., DOC, PDF, etc.) that you may want to delegate and index later.
ignore="$db|\.(gif|jpg|png|xcf|gz|tgz|tbz|p[pb]m|tiff?|mp[23g]|mpeg|wav|midi?|sid|au|r[am]|[au]law|xbm|pag|dir|swp|idx|psd|xls|sxw|zip|pgm|wm[av]|eps|swf|aux|bbl|idx|tex|raw|od[st])$"

# Here comes the big pipe!
/bin/echo "========= Searching for indexable content ============="

# If there's no EXCLUDE file, create one that just excludes itself
# (needed by the 'egrep -ivf EXCLUDE' filter.)
[ ! -f "$dir/EXCLUDE" ] && echo '/EXCLUDE$' > "$dir/EXCLUDE"

# Ignore the Hyperestraier index and any empty or "non-regular" files
/usr/bin/find . -wholename "$db" -prune -o -type f -size +0|\
	# Generate 'file' output for each file
	/usr/bin/xargs -d '\n' -I '{}' -s 1000 file -F '///' '{}'|\
	# Omit these specified filetypes (false negatives)
	/bin/egrep -iv '///.*(latex|rich)'|\
	# Omit everything _except_ these filetypes (positive matches)
	/bin/sed -n 's#^\(.*\)///.*\(text\|xml\|pod_doc\).*$#\1#;T;p'|\
	# Exclude any filenames that match patterns in the 'EXCLUDE' file
	/bin/egrep -ivf './EXCLUDE'|\
	# Exclude files that match the 'ignore' pattern
	/bin/egrep -iv "$ignore"|\
	# Index the remaining results
	/usr/bin/estcmd gather $gather_opts "$db" -

# Remove the 'spurious' EXCLUDE file
[ "`/bin/cat $dir/EXCLUDE`" = '/EXCLUDE$' ] && rm "$dir/EXCLUDE"

/bin/echo "================== Optimizing... ======================"
/usr/bin/estcmd extkeys "$db"
/bin/sleep 1
/usr/bin/estcmd optimize "$db"
/usr/bin/estcmd purge -cl "$db"
/bin/echo "==================== Finished. ========================"

