/*
	Paul Tero, 2004
	These are my string matching functions. They basically convert a string into an integer which 
	can then be XORed with another integer to see how closely they match. This conversion can
	be done whenever the string is updated and stored in a different database column, so that
	the string matching will be very quick (quicker than Levenshtein distance at least).

	The core function is XORC, which does the conversion. It divides the string into syllables and
	then finds the least common letter in each syllable and stores its value in an integer. The
	32 bit integer contains the values of 6 such letters.
*/

/*
        To include this in MySQL do something like (where the include directory contains MySQL headers):
        gcc -I/home/local/mysql-3.23.44-pc-linux-gnu-i686/include/ -shared -o /usr/lib/xorc.so xorc.c
        mysql> DROP FUNCTION xorc; CREATE FUNCTION xorc RETURNS INT SONAME "xorc.so";
        mysql> SELECT xorc('Air Jamaica');


        gcc -I/home/local/mysql-3.23.44-pc-linux-gnu-i686/include/ -shared -o /usr/lib/xorm.so xorc.c
        mysql> DROP FUNCTION xorm; CREATE FUNCTION xorm RETURNS INT SONAME "xorm.so";
        mysql> SELECT xorm('Air Jamaica', 'Air France');
        mysql> SELECT xorm('Air Jamaica', 89);


        gcc -I/home/local/mysql-3.23.44-pc-linux-gnu-i686/include/ -shared -o /usr/lib/xorp.so xorc.c
        mysql> DROP FUNCTION xorp; CREATE FUNCTION xorp RETURNS INT SONAME "xorp.so";
        mysql> SELECT xorp('Air Jamaica', 'Air France');
        mysql> SELECT xorp('Air Jamaica', 89);

        For command line testing, uncomment the main function below and compile as an executable:
        gcc -I/home/local/mysql-3.23.44-pc-linux-gnu-i686/include/ xorc.c
        ./a.out
*/


/*****************************************************/
/*Function prototypes and libraries needed to compile*/
/*****************************************************/
#include <global.h>
#include <my_sys.h>
#include <mysql.h>
#include <m_ctype.h>
#include <m_string.h>

#include <stdlib.h>
#include <malloc.h>
#include <string.h>



//The following lines are for testing it from the command line
//#include <stdio.h>
//int main(int argc, char **argv) {printf("Answer: %d\n", xorc_internal ("Air Jamaica", 11));}


//Export the xorc, xorm and xorp functions
long long xorm (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error);
long long xorp (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error);
long long xorc (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error);

//Internal helpers. They are called before their definitions appear in this file, so they
//must be declared here: implicit function declarations are not valid C99, and without a
//prototype the unsigned long string length would not be converted to the int parameter.
int xorm_internal (UDF_ARGS *args);
int xorc_internal (char *phrase, int phraselen);



long long xorm (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) {return xorm_internal (args);}
/*
 * Initialisation hook for XORM() (also reused by XORP()).
 * Rejects the call unless exactly two arguments were supplied, then asks for every
 * argument that is not already a string to be passed as an integer.
 * Returns 0 on success, or 1 after copying an error text into message.
 */
my_bool xorm_init (UDF_INIT *initid, UDF_ARGS *args, char *message) {
	unsigned int slot; //index of the argument being inspected

	if (args->arg_count != 2) {
		strcpy (message,"XORM() AND XORP() require two string or integer arguments");
		return 1;
	}

	//we accept strings or integers, but not null: anything that is not a string is coerced to an integer
	for (slot = 0; slot < 2; slot++) {
		if (args->arg_type[slot] != STRING_RESULT) {
			args->arg_type[slot] = INT_RESULT;
		}
	}
	return 0;
}
/*
 * Shared worker for XORM() and XORP().
 * Turns each of the two arguments into an int code and returns the XOR of the two:
 *   - a NULL argument contributes 0,
 *   - a string argument is converted with xorc_internal(),
 *   - any other argument is read as a long long and narrowed to int.
 */
int xorm_internal (UDF_ARGS *args) {
	int codes[2]; //code derived from each argument
	int k;

	for (k = 0; k < 2; k++) {
		if (!args->args[k]) {
			codes[k] = 0; //null argument
		} else if (args->arg_type[k] == STRING_RESULT) {
			codes[k] = xorc_internal (args->args[k], args->lengths[k]);
		} else {
			codes[k] = (int) *((long long*) args->args[k]); //integer argument, narrowed to int
		}
	}
	return codes[0] ^ codes[1];
}




/*
 * XORP(a, b): SQL entry point returning a rough match percentage (0-100).
 *
 * The XOR of the two codes (see xorm_internal) is scanned from the most significant end:
 * the higher the first differing bit, the earlier the two phrases diverge and the lower
 * the score. The thresholds step in units of 5 bits and the returned values approximate
 * (number of leading matching bits) / 31 * 100.
 *
 * The XOR is reinterpreted as an unsigned 32-bit value first. Otherwise a result with the
 * sign bit set (possible when a negative integer is passed as an argument) would be
 * sign-extended to a negative long long, fail every "> 0" test below and be reported as a
 * 100% match.
 */
long long xorp (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) {
        long long result = (long long) (unsigned int) xorm_internal (args); //always >= 0
        if ((result >> 26) > 0) return 0; //a difference at bit 26 or above: 0 of 31 leading bits match
        if ((result >> 21) > 0) return 16; //at least the leading 5 of 31 bits match
        if ((result >> 16) > 0) return 32; //leading 10 of 31
        if ((result >> 11) > 0) return 48; //leading 15 of 31
        if ((result >> 6) > 0) return 65; //leading 20 of 31
        if ((result >> 1) > 0) return 81; //leading 25 of 31
        if (result > 0) return 97; //only the lowest bit differs: 30 of 31
        return 100; //no difference at all: 31 of 31
}
/*
 * Initialisation hook for XORP(). XORP() takes the same arguments as XORM(), so the
 * check is delegated to xorm_init() and its status (0 = ok, 1 = error with message set)
 * is passed back to the caller. The original body dropped that status and fell off the
 * end of a non-void function, leaving the return value indeterminate.
 */
my_bool xorp_init (UDF_INIT *initid, UDF_ARGS *args, char *message) {
	return xorm_init (initid, args, message);
}




/*
 * XORC(str): SQL entry point. Returns the code computed by xorc_internal() for the
 * string argument, or 0 when the argument is NULL. Only args is used, the other
 * parameters are ignored.
 */
long long xorc (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) {
	//null arguments are not accepted: even with the type coercion in xorc_init, nulls still arrive here in joins
	if (args->args[0]) {
		return (long long) xorc_internal (args->args[0], args->lengths[0]);
	}
	return 0;
}
/*
 * Initialisation hook for XORC().
 * Succeeds (returns 0) only for a call with exactly one argument, which is then forced
 * to be passed as a string. Otherwise copies an error text into message and returns 1.
 */
my_bool xorc_init (UDF_INIT *initid, UDF_ARGS *args, char *message) {
	if (args->arg_count == 1) {
		args->arg_type[0] = STRING_RESULT; //only strings are accepted, so coerce whatever was supplied
		return 0;
	}
	strcpy (message,"XORC() requires one string argument");
	return 1;
}
/*
 * Convert a phrase into the integer code used by XORC/XORM/XORP.
 *
 * The phrase is read left to right and cut into "syllables": a syllable ends at a
 * separator (any byte value below 33) or at the first non-vowel that follows one or more
 * vowels. For each syllable the highest rarity rank among its symbols (see the table
 * below) is stored in a 5-bit field. The first syllable goes to bits 26-30, the next to
 * bits 21-25, and so on down to bits 1-5, so at most six syllables are recorded. If the
 * sixth field is closed while input remains, bit 0 is set as a "phrase continues" marker.
 *
 * phrase    - the bytes to encode; need not be NUL terminated
 * phraselen - number of bytes of phrase to read
 * returns   - the code; 0 for an empty or all-separator phrase
 *
 * NOTE(review): bytes are read through plain char, whose signedness is implementation
 * defined. Where char is signed, bytes >= 0x80 become negative and are treated as
 * separators by the "c < 33" tests; where it is unsigned they are skipped as invalid
 * symbols instead. Left unchanged so codes already stored in a database stay valid.
 */
int xorc_internal (char *phrase, int phraselen) {
        //rarity rank of each symbol: entries 0-25 are A-Z (a is the 2nd most frequent, b the 19th, etc),
        //entries 26-30 are the digit pairs 0/1, 2/3, 4/5, 6/7, 8/9. Shared, read-only, built once.
        static const int frequencies[31] = {2, 19, 11, 10, 0, 14, 16, 8, 4, 23, 21, 9, 13, 5, 3, 15, 24, 7, 6, 1, 12, 20, 17, 22, 18, 25, 26, 27, 28, 29, 30};
        int isvowel = 0; //whether the current character is a vowel
        int numvowels = 0; //number of vowels seen in the current syllable
        int maxvalue = 0; //highest rank found so far in the current syllable
        int shift = 26; //bit position of the field the current syllable will be written to
        int result = 0; //the code being assembled
        int i; //index of the current character
        int c; //numerical value of the current character

        for (i = 0; i < phraselen; i++) {
                c = (int) (phrase[i]); //get its ordinal value
                if (c < 33 && maxvalue == 0) continue; //a separator while nothing has been collected yet
                if (c >= 97 && c <= 122) c -= 32; //fold a-z to upper case A-Z
                if (c >= 48 && c <= 57) c = c/2 + 67; //digits map to 91-95, straight after the letters
                isvowel = (c==65 || c==69 || c==73 || c==79 || c==85); //A, E, I, O or U

                if (c < 33 || (numvowels && !isvowel)) { //end of a syllable
                        result |= maxvalue << shift; //store this syllable's field
                        shift -= 5; //the next field sits 5 bits lower
                        numvowels = maxvalue = 0; //start a fresh syllable
                        if (shift < 0) break; //all six fields are used
                }

                if (c >= 65 && c <= 95) { //a symbol covered by the table (anything else is skipped)
                        if (isvowel) numvowels++;
                        //plain comparison instead of the max() macro, which is not standard C and is
                        //only available when the MySQL headers happen to define it
                        if (frequencies[c-65] > maxvalue) maxvalue = frequencies[c-65];
                }
        }

        //either flag that we ran out of fields, or store the syllable that was still open
        result |= (shift < 0) ? 1 : (maxvalue << shift);
        return result;
}
