package CHARLINT;

$VERSION = '0.46';

use strict;

my %CombClass;
my %DecoKompData;
my %CompCano;
my %DecoCano;
my %exists;

sub Charlint
{
	my %args = @_;
	my $name = $args{Name};
	my $dataFile = $args{DataFile};
	my ($ch,$elem,$res,@line,@line2,$starterPos,$sourceLength);
	my ($composite,$sourcePos,$targetPos,$lastClass,$starterCh,$chClass);
	my %result;

	if(! %CombClass)
	{
		ReadStoredData($dataFile);
	}

	if ($res = CheckUTF8($name))
	{
		$result{Error}= "$name: Non-UTF-8 ($res).";
		return \%result;
	}

	if($res = CheckPrivate($name))
	{
		$result{Error}= "$name: Private characters ($res).";
                return \%result;
	}

	@line = splitutf8($name);
    	@line2 = ();

	my $t_res;

	while (defined($elem = shift @line))
	{
		$t_res = DecoKomp($elem);
		if(defined $t_res->{error})
		{
			$result{Error} = $t_res->{error};
			return \%result;
		}
		else
		{
			push @line2, splitutf8($t_res->{result});
		}
	}	

	@line = sortCano(@line2);

	$starterPos   = 0;
	$sourceLength = @line;
	if ($sourceLength == 0) 
	{
                $targetPos = 0;
        }
	else 
	{
                $targetPos = 1;
                $starterCh = $line[0];
                $lastClass = -1;     # this eliminates a special check
        }
	
	for ($sourcePos = 1; $sourcePos < $sourceLength; $sourcePos++)
	{
		$ch = $line[$sourcePos];
		$chClass = $CombClass{$ch};
		$chClass = 0  if (!defined($chClass));
		$composite = CompCano($starterCh, $ch);
		if (defined($composite) && $lastClass < $chClass)
		{
                	$line[$starterPos] = $composite;
                	$starterCh = $composite;
                }
		elsif ($chClass == 0) 
		{
                	$starterPos = $targetPos;
                	$starterCh  = $ch;
                	$lastClass  = -1;
               		$line[$targetPos++] = $ch;
                }
		else 
		{
                	$lastClass = $chClass;
                	$line[$targetPos++] = $ch;
                }
	}	
	$#line = $targetPos-1;

	$result{charlint_domain}= (join "", @line);
        return \%result;
}

sub ReadStoredData 
{
	my ($dataFile) = @_;
	require Storable;    # in line, to not require module if not needed
	my %all_data = %{&Storable::retrieve ($dataFile)};
 
	%exists = %{$all_data{exists}};
	%DecoCano = %{$all_data{DecoCano}};
	%CompCano = %{$all_data{CompCano}};
	%DecoKompData = %{$all_data{DecoKompData}};
	%CombClass = %{$all_data{CombClass}};
}

sub CheckUTF8 
{
    my ($s) = @_;
    my ($st);
    if ($s =~ /[\355][\240-\257][\200-\277][\355][\260-\277][\200-\277]/) {
        return "surrogate pair";
    }
    if ($s =~ /[\355][\240-\277][\200-\277]/) {
        return "single surrogate";
    }
    if ($s =~ /[\300-\301][\200-\277]/) {
        return "ASCII in 2 bytes";
    }
    if ($s =~ /[\340][\200-\237][\200-\277]/) {
        return "3 bytes instead of 2 or less";
    }
    if ($s =~ /[\360][\200-\217][\200-\277]{2}/) {
        return "4 bytes instead of 3 or less";
    }
    if ($s =~ /[\370][\200-\207][\200-\277]{3}/) {
        return "5 bytes instead of 4 or less";
    }
    if ($s =~ /[\374][\200-\203][\200-\277]{4}/) {
        return "6 bytes instead of 5 or less";
    }
    # non-synchronized cases
    $s =~ s{   [\000-\177]
             | [\300-\337][\200-\277]
             | [\340-\357][\200-\277]{2}
             | [\360-\367][\200-\277]{3}
             | [\370-\373][\200-\277]{4}
             | [\374-\375][\200-\277]{5}
           }{}gx;
    # forbidden bytes
    if ($s =~ /[\376\377]/) {
        return "0xFE or 0xFF byte";
    }
    if ($s ne "") {
        return "synchronization problem";
    }
    return undef;
}

sub CheckPrivate 
{
    my ($s) = @_;
    if ($s =~ /[\356][\200-\277]{2}|[\357][\200-\237][\200-\277]/) {
        return "BMP";
    }
    if ($s =~ /[\363][\260-\277][\200-\277]{2}/) {
        return "plane 15";
    }
    if ($s =~ /[\364][\200-\217][\200-\277]{2}/) {
        return "plane 16";
    }
    return undef;
}

sub splitutf8 {
    my $in = shift;
    my @arr = split (/(?=[\000-\177\300-\377])/, $in); 
    return @arr;
}

sub sortCano {
    my @a = @_;
    my ($i, $ccHere, $ccPrev, $temp);
 
    return @a  if (@a <= 1);
    for ($i=1; $i < @a; $i++) {
        $ccHere = $CombClass{$a[$i]};
        $ccPrev = $CombClass{$a[$i-1]};
        $ccHere = 0  if (!defined($ccHere));
        $ccPrev = 0  if (!defined($ccPrev));
        if ($ccHere != 0  &&  $ccPrev > $ccHere) {
            $temp    = $a[$i];     # exchange
            $a[$i]   = $a[$i-1];
            $a[$i-1] = $temp;
            $i -= 2  if ($i > 1);  # backtrack and check again
        }
    }
    return @a;
}

sub DecoKomp {
    my ($s) = @_;
    my (%result,$h);
    my $hash = utf82num($s);
    if(defined $hash->{error})
    {
	$result{error} = $hash->{error};
	return \%result;
    }
    else
    {
	$h = $hash->{result};
    }
    if ($h >= 0xAC00 && $h < 0xD7A4) {
                $result{result} =  DecoCano($s);  #refer to DecoCano for Hangul decomposition
		return \%result;
    }
    else {
                $result{result}  = $DecoKompData{$s};
		if (defined $result{result})
		{
			return \%result;
		}
		else
		{
			$result{result} = $s;
			return \%result;
		}
    }
}

sub utf82num {
    my(@t, $t, %result, $trail);
 
    @t = unpack ("C*", $_[0]);
    $t = shift (@t);
    if    ($t<0x80) { $result{result} = $t       ; $trail=0; }
    elsif ($t<0xC0) { $result{error} = "Illegal leading byte in UTF-8."; return \%result;}
    elsif ($t<0xE0) { $result{result} = $t & 0x1F; $trail=1; }
    elsif ($t<0xF0) { $result{result} = $t & 0x0F; $trail=2; }
    elsif ($t<0xF8) { $result{result} = $t & 0x07; $trail=3; }
    elsif ($t<0xFC) { $result{result} = $t & 0x03; $trail=4; }
    elsif ($t<0xFE) { $result{result} = $t & 0x01; $trail=5; }
    else            { $result{error} = "Illegal byte in UTF-8.";  return \%result;}
 
    if ($trail != $#t + 1) { $result{error} =  "Not right number of trailing bytes."; return \%result; }
    while ($trail--) {
        # maybe check for 01xxxxxx
        $result{result} <<= 6;
        $result{result} += 0x3F & shift (@t);
    }
    return \%result;
}

sub CompCano {
    my ($starterCh, $ch) = @_;
    my $s = utf82num($starterCh);
    my $c = utf82num($ch);
    if ($s >= 0x1100 && $s < 0x1113 && $c >= 0x1161 && $c < 0x1176) {
        return num2utf8((($s-0x1100)*21+$c-0x1161) * 28 + 0xAC00);
    }
    elsif ($s >= 0xAC00 && $s < 0xD7A4 && !(($s-0xAC00)%28) && $c >= 0x11A8 && $c < 0x11C3) {
        return num2utf8($s + $c - 0x11A7);
    }
    else {
        return $CompCano{join "", ($starterCh, $ch)};
    }
} # end CompCano

sub num2utf8 {
    my ($t) = @_;
    my ($trail, $firstbits, @result);
 
    if    ($t<0x00000080) { $firstbits=0x00; $trail=0; }
    elsif ($t<0x00000800) { $firstbits=0xC0; $trail=1; }
    elsif ($t<0x00010000) { $firstbits=0xE0; $trail=2; }
    elsif ($t<0x00200000) { $firstbits=0xF0; $trail=3; }
    elsif ($t<0x04000000) { $firstbits=0xF8; $trail=4; }
    elsif ($t<0x80000000) { $firstbits=0xFC; $trail=5; }
    else {
        die "Too large scalar value, cannot be converted to UTF-8.\n";
    }
    for (1 .. $trail) {
        unshift (@result, ($t & 0x3F) | 0x80);
        $t >>= 6;         # slight danger of non-portability
    }
    unshift (@result, $t | $firstbits);
    pack ("C*", @result);
}

sub DecoCano {
    my ($s) = @_;
    my $h = utf82num($s);
    if ($h >= 0xAC00 && $h < 0xD7A4) {
        my $hindex = $h - 0xAC00;
        my $l = 0x1100 + $hindex/(21*28);
        my $v = 0x1161 + ($hindex % (21*28)) / 28;
        my $t = $hindex % 28;
        if ($t) {
            return join "", num2utf8($l), num2utf8($v), num2utf8(0x11A7 + $t);
        }
        else {
            return join "", num2utf8($l), num2utf8($v);
        }
    }
    else {
                my $r = $DecoCano{$s};
        return $r if defined $r;
                return $s;
    }
}

return 1;
