#!usr/local/bin/perl -w
my @words1 = ();
my @words2 = ();
open(W1, "<Lang1");
#feed in Indonesian data
while(my $line = <W1>) {
chomp $line;
$line =~ s/[ ]+//g;
if($line) {
push(@words1,
$line);
}
}
close W1;
open(W1, "<Lang2"); #feed in
Maori data
while(my $line = <W1>) {
chomp $line;
$line =~ s/[ ]+//g;
if($line) {
push(@words2, $line);
}
}
close W1;
# MAIN
foreach my $word1 (@words1) {
foreach my $word2 (@words2) {
#run compare words
subroutine
on a word from each list
my $result = compare_words($word1,
$word2);
if($result
== 1) {
print "$word1 vs $word2";
print "\t Match\n";
}
#else {
# print "\tNo match\n";
#} #commented out to leave off non-matching sets for
clarity
}
}
sub compare_words {
my($word1, $word2) = @_;
my $max = 0; #max for calculating LCS
my %strings1 = ();
my %strings2 = ();
# define
threshold
so i can print those that meet or exceed later
my $thresh = 0.7;
if(length($word1) >= length($word2)) {
$max =
length($word1);
} #determine which
word has greatest length
else {
$max =
length($word2);
}
#print "Max: $max\n";
for (my $i =
0; $i < length($word1); ++$i) {
for (my $j = 1; $j <=
length($word1);
++$j) {
$strings1{substr($word1,
$i, $j)} = 1;
}
}
for (my $i =
0; $i < length($word2); ++$i) {
for (my $j = 1; $j <=
length($word2);
++$j) {
my $sub = substr($word2, $i, $j);
$strings2{$sub} = 1;
}
#determine which word has greatest length
else {
$max = length($word2);
}
#print "Max: $max\n";
for (my $i =
0; $i < length($word1); ++$i) {
for (my $j = 1; $j <=
length($word1);
++$j) {
$strings1{substr($word1,
$i, $j)} = 1;
}
}
for (my $i =
0; $i < length($word2); ++$i) {
for (my $j = 1; $j <=
length($word2);
++$j) {
my $sub = substr($word2,
$i, $j);
$strings2{$sub} = 1;
#add next two lines for each substitution rule
$sub =~ s/p/b/g;
$strings2{$sub} = 1;
###############
$sub =~ s/r/l/g;
$strings2{$sub} = 1;
###############
$sub =~ s/t/d/g;
$strings2{$sub} = 1;
}
}
#calc longest common sequence a
la Melamed 1999
my $max_lcs = 0;
foreach my $s1 (keys %strings1) {
if($strings2{$s1})
{
my $lcs = length($s1);
if($lcs > $max_lcs) {
$max_lcs = $lcs;
}
#print "\tMatch ".$s1." L: ".length($s1)."\n";
}
}
#print "\nResult: $max_lcs\n";
if(
($max_lcs/$max) >= $thresh) { #take max
LCS divide by no. of longest word
return 1;
}
else {
return 0;
}
}
#It wouldn't be much
trouble to adapt this for other pairs of related languages,
#or if you're slightly adept at modifying
someone else's script, you could identify
#cognates from dozens of languages from any family. Scroll down to the
part about
#substitutions, tweak accordingly and then modify the "longest
common sequence"
#chunk to suit your needs. Many thanks to Dr. McFetridge and Damir for
their help.