#!/bin/csh -f

# Compare two directories and make identical files hard linked copies
# of one of the files.

# usage: ~/consolidate /var/tmp/watrous.110922.0959 /var/tmp/watrous

# daw; 9/23/11

# Select platform-specific helpers based on `uname -s` + `uname -r`:
#   ECHO  - external echo binary, invoked with -n for the progress dots
#   TPLUS - prefix placed before a line number so that tail(1) starts its
#           output at that line ("tail +N" vs. "tail --lines=+N")
# Any other platform is rejected with a message and a non-zero exit status
# so that a calling script can detect the failure.
switch (`uname -s``uname -r`)
case SunOS5*:
    set ECHO = /usr/ucb/echo
    set TPLUS = "+"
    breaksw
case Linux*:
    set ECHO = /bin/echo
    set TPLUS = "--lines=+"
    breaksw
default:
    echo ${0}: does not run on `uname -s` `uname -r`
    exit 1
endsw

# Parse the command line.  Recognized options:
#   -dd    set DDEBUG (each inode is echoed as it is processed); implies -d
#   -d     dry run: DEBUG becomes /bin/echo, which is later used as a prefix
#          on the destructive commands so they are printed, not run;
#          implies -v
#   -v     set VERBOSE (timestamped progress messages)
#   -df    set DF (run df on the first directory before and after)
#   -dots  set DOTS (print a "." for every skipped file)
#   -s     set SUMMARIZE (timestamped start line and final totals)
# The first two arguments that name existing directories become DIR1 and
# DIR2, each with one trailing "/" removed.  Anything else (a non-directory,
# or a third directory) is an error and terminates with exit status 1.
while ($#argv)
	switch ("$1")
	case -dd:
		set DDEBUG
		# no breaksw: -dd falls through to -d
	case -d:
		set DEBUG = /bin/echo
		# no breaksw: -d falls through to -v
	case -v:
		set VERBOSE
		breaksw
	case -df:
		set DF
		breaksw
	case -dots:
		set DOTS
		breaksw
	case -s:
		set SUMMARIZE
		breaksw
	default:
		if ( -d "$1" && ! $?DIR1 ) then
			set DIR1 = `echo $1 | sed 's;/$;;'`
		else if ( -d "$1" && ! $?DIR2 ) then
			set DIR2 = `echo $1 | sed 's;/$;;'`
		else
			# DQ holds a literal double quote so it can be
			# embedded in the double-quoted message below.
			set DQ = '"'
			echo ${0}: "What do I do with $DQ$1$DQ?"
			exit 1
		endif
		breaksw
	endsw
	# Every accepted argument consumes exactly one word.
	shift
end

# Two directories are mandatory; DIR2 is only set once DIR1 has been, so
# testing DIR2 covers both.  Exit non-zero so callers can detect misuse.
if (! $?DIR2) then
    echo usage: $0 "<directory> <directory>"
    exit 1
endif

# Outside of dry-run mode DEBUG must still exist (as an empty value) because
# it is expanded as a command prefix further down.
if (! $?DEBUG) set DEBUG

# Running totals reported by -s.
set DONE = 0
set SKIPPED = 0

# -df: show disk usage for the first directory before any work is done.
if ($?DF) then
    /bin/df -k $DIR1
endif

if ($?SUMMARIZE) echo `date +%T` Consolidating $DIR1 and $DIR2

# Scratch files live under /tmp and share the prefix $TMP.  The process id
# is part of the prefix so that two simultaneous runs do not overwrite each
# other's listings.
set BASE = `basename $0`
set TMP = /tmp/$BASE.$$

# Record a `find -ls` listing of every regular file in each tree:
#   $TMP.1 - files under $DIR1
#   $TMP.2 - files under $DIR2
# The first column of each line is the inode number and the last column is
# the path; both are picked out of these listings later.
if ($?VERBOSE) echo `date +%T` Finding files in $DIR1
find $DIR1 -type f -ls > $TMP.1
if ($?VERBOSE) echo `date +%T` Finding files in $DIR2
find $DIR2 -type f -ls > $TMP.2

# Build the list of candidate inodes: those that appear in the $DIR1 listing
# but not in the $DIR2 listing.  An inode number present in both listings
# presumably means the file is already hard linked across the two trees, so
# there is nothing left to consolidate for it.
if ($?VERBOSE) echo `date +%T` Isolating inodes
# comm(1) requires input in collating order, not numeric order, which is why
# plain sort is used instead of sort -n (with sort -n, comm on richelieu
# complained "file 1 is not in sorted order", 5/14/15).
# -u: several hard links inside one tree share an inode number.  comm pairs
# lines one-for-one, so without -u an inode listed twice in $DIR1 and once in
# $DIR2 would still be reported as "only in $DIR1".
awk '{print $1}' $TMP.1 | sort -u > $TMP.1.inodes
awk '{print $1}' $TMP.2 | sort -u > $TMP.2.inodes

# comm -23 suppresses "only in file 2" and "in both", leaving the lines that
# occur only in the first file.
if ($?VERBOSE) echo `date +%T` Finding inodes only in $DIR1
comm -23 $TMP.1.inodes $TMP.2.inodes > $TMP.inodes
if ($?VERBOSE) echo `date +%T` `cat $TMP.inodes | wc -l` inodes in $DIR1 not shared with $DIR2

# Walk the candidate inodes in $TMP.inodes and, for every path in $DIR1 that
# has an identical counterpart at the same relative path in $DIR2, replace
# the $DIR1 file with a hard link to the $DIR2 file.
#
# The list is consumed CHUNK inodes at a time (presumably to keep each
# foreach word list short).  uniq guards against an inode being listed more
# than once, which would make the same files be processed twice.
#
# Known limitations, inherited from the `find -ls` / awk '{print $NF}'
# parsing: only the last whitespace-separated word of a path is seen, and
# paths containing "[" or "*" are deliberately mangled (_LSQ_ / _ESC_) so
# that they are never matched and end up skipped.

@ OFF = 1		# 1-based line number where the next chunk starts
@ CHUNK = 1000		# how many inodes per chunk

set MAX = `uniq $TMP.inodes | wc -l`

while ($OFF <= $MAX)
	foreach INODE ( `uniq $TMP.inodes | tail $TPLUS$OFF | head -$CHUNK` )
		if ($?DDEBUG) echo $INODE
		# One inode can have several paths inside $DIR1 (hard links
		# within the tree), so handle each matching path separately.
		# The inode column may be preceded by spaces (12/15/16).
		# Left square brackets and asterisks trip up csh globbing, so
		# they are replaced with placeholders.
		foreach ORIG ( `grep ^"[ ]*$INODE " $TMP.1 | awk '{print $NF}' | sed -e 's;\[;_LSQ_;g' -e  's;\*;_ESC_;g'` )
			# Map the path into $DIR2; anchored so that only the
			# leading directory prefix is rewritten.
			set PDUP = `echo "$ORIG" | sed "s;^$DIR1;$DIR2;"`
			# PDUP == ORIG means the prefix was not rewritten;
			# never "consolidate" a file with itself.
			if ( "$PDUP" != "$ORIG" && -e "$PDUP" ) then
			    # BSD-style md5 prints the digest last, md5sum
			    # prints it first.
			    if (-e /usr/local/bin/md5) then
				set OMD5 = `/usr/local/bin/md5 "$ORIG" | awk '{print $NF}'`
				set PMD5 = `/usr/local/bin/md5 "$PDUP" | awk '{print $NF}'`
			    else
				set OMD5 = `/usr/bin/md5sum "$ORIG" | awk '{print $1}'`
				set PMD5 = `/usr/bin/md5sum "$PDUP" | awk '{print $1}'`
			    endif
			    # An empty digest means the checksum command
			    # failed; treat that as "not identical".
			    if ( "$OMD5" != "" && "$OMD5" == "$PMD5" ) then
				if ($?DOTS) /bin/echo ""
				if ($?VERBOSE) echo `date +%T` consolidating $ORIG
				# Create the new link under a temporary name
				# and rename it over the original, so the
				# original is never removed unless the link
				# was actually created.
				set NEWLINK = "$ORIG.consolidate.$$"
				$DEBUG /bin/ln "$PDUP" "$NEWLINK"
				if ($status == 0) then
				    $DEBUG /bin/mv -f "$NEWLINK" "$ORIG"
				    if ($status == 0) then
					@ DONE++
				    else
					/bin/rm -f "$NEWLINK"
				    endif
				else
				    echo "${0}: could not link $ORIG to $PDUP - left unchanged"
				endif
			    endif
			else
			    # No counterpart in $DIR2: nothing to link to.
			    if ($?DOTS) $ECHO -n "."
			    @ SKIPPED++
			endif
		end
	end
	@ OFF += $CHUNK
end

# -s: report the totals accumulated by the main loop.
if ($?SUMMARIZE) echo `date +%T` $DONE files consolidated\; $SKIPPED skipped

# -df: show disk usage for the first directory again, after the work.
if ($?DF) then
    /bin/df -k $DIR1
endif

# Remove exactly the scratch files this script created.  A "$TMP*" glob would
# also delete any unrelated file in /tmp whose name merely starts with the
# same prefix.
/bin/rm -f $TMP.1 $TMP.2 $TMP.1.inodes $TMP.2.inodes $TMP.inodes
