%%TODO make functions
%%TODO remove hardcoding
%%TODO reduce number of for loops, use cellfun?
%%TODO disallow invalids
clear;

%%GE in MATLAB is GEM

%%Parse File

% Read the grammar file line by line into the 1-by-N cell array `lines`,
% one string per line with leading and trailing whitespace removed.
GRAMMAR_FILE = 'grammars/sr.bnf';
[fid, fopen_msg] = fopen(GRAMMAR_FILE,'r');
% fopen returns -1 when the file cannot be opened; fail here with the
% file name instead of letting fgetl complain about an invalid identifier
if fid == -1
    error('Could not open grammar file %s: %s', GRAMMAR_FILE, fopen_msg);
end
lines = {};
cnt = 1;
while 1
    line = fgetl(fid);
    % fgetl returns a non-char value (-1) once the end of file is reached
    if ~ischar(line)
        break
    end
    lines(cnt) = cellstr(line);
    cnt = cnt + 1;
end
fclose(fid);
lines = strtrim(lines);

%TODO allow rules to be on separate lines and comments at the end of lines
NON_TERMINAL_PATTERN = '(<.+?>)';
RULE_SEPARATOR = '::=';
PRODUCTION_SEPARATOR = '\|';
COMMENT = '#';
%%Grammar struct, formal setup, <T, NT, R, S>
%%T is a cell of strings
%%NT is a cell of strings, in the order the rules appear in the file
%%R is a Cell; R{n} holds the productions of NT{n}, each production being
%%a cell of symbols
%%S is a string
cfg.rules = {}; % rules
cfg.s_T = {}; % Terminals
cfg.s_NT = {}; % Non-Terminals
cfg.s = ''; % Start symbol
%Parse the lines; every non-empty, non-comment line must hold one rule of
%the form  <lhs> ::= production | production | ...
for i = 1:1:size(lines, 2)
    line = lines{i};
    if ~isempty(line) && ~strncmp(line, COMMENT, 1)
        %Find lhs and rhs
        splits = strtrim(regexp(line, RULE_SEPARATOR, 'split'));
        if length(splits) < 2
            error('Line %d has no %s separator: %s', i, RULE_SEPARATOR, line);
        end
        %TODO rhs rules on multiple lines
        lhs = splits{1};
        if isempty(regexp(lhs, NON_TERMINAL_PATTERN, 'once'))
            error('LHS %s must be non-terminal',lhs);
        end
        if any(strcmp(cfg.s_NT, lhs))
            error('lhs %s must be unique',lhs);
        end
        % Appending keeps cfg.s_NT in file order, so cfg.s_NT{n} and
        % cfg.rules{n} always describe the same rule
        cfg.s_NT = [cfg.s_NT {lhs}];
        splits = strtrim(regexp(splits{2}, PRODUCTION_SEPARATOR, 'split'));
        rhs = {};
        %Find productions
        for j = 1:1:length(splits)
            tmp_production = {};
            production = splits{j};
            % Match non terminal or terminal pattern. The terminal
            % alternative also covers productions without any
            % non-terminal, so no separate branch is needed for them.
            % TODO does not handle quoted NT symbols
            symbols = regexp(production, '(?<NT><.+?>)|(?<T>[^<>]*)', 'tokens');
            symbols = strtrim(symbols);
            for k = 1:1:length(symbols)
                symbol = symbols{k};
                % Anything that does not start with '<' is a terminal
                if ~strncmp(symbol, '<', 1)
                    cfg.s_T = [cfg.s_T symbol];
                end
                tmp_production = [tmp_production{:} symbol];
            end
            rhs = {rhs{:} tmp_production};
        end
        cfg.rules = {cfg.rules{:} rhs};
    end
end
if isempty(cfg.s_NT)
    error('Grammar contains no rules');
end
cfg.s_T = unique(cfg.s_T);
% cfg.s_NT is deliberately NOT passed through unique(): duplicates are
% already rejected above, and unique() sorts its output, which would break
% the index correspondence with cfg.rules and change the start symbol.
% The start symbol is the lhs of the first rule in the file.
cfg.s = cfg.s_NT{1};

% Compare element counts; comparing size vectors with ~= inside an if is
% only true when every dimension differs
if numel(cfg.s_NT) ~= numel(cfg.rules)
    error('NTs must equal rules in size');
end
cfg

% Target data: the sample points x and the values f_x the evolved
% expressions are compared against
x = -1:0.1:1;
f_x = x.^6 + x.^5 + x.^4 + x.^3 + x.^2 + x;

% Run parameters
MAX_WRAPS = 2;
CODON_SIZE = 128;
POPULATION_SIZE = 100;
INITIAL_CHROMOSOME_SIZE = 50;
GENERATIONS = 2;
TOURNAMENT_SIZE = 3;
CROSSOVER_PROBABILITY = 0.7;
MUTATION_PROBABILITY = 0.01;
ELITE_SIZE = 1;

% Column layout of an individual inside the population cell array
INPUT_COULMN = 1;
OUTPUT_COLUMN = 2;
FITNESS_COLUMN = 3;
USED_INPUT_COLUMN = 4;
%TODO ugly hack for ind size in cells
IND_SIZE = 4;

%Cache of expressions and fitnesses that have already been calculated
%TODO preassign size of fitness store and keep track of size
fitness_store = {};
%Set rand seed
%rand('seed',0)
%%Initialize population

%One row of random codons per individual
inputs = randi(CODON_SIZE, POPULATION_SIZE, INITIAL_CHROMOSOME_SIZE);
%Each population row is {input, output, fitness, used_input}; a new
%individual has its codons, an empty output and NaN for the two numbers
pop = cell(POPULATION_SIZE, IND_SIZE);
pop(:, INPUT_COULMN) = num2cell(inputs, 2);
pop(:, OUTPUT_COLUMN) = {{}};
pop(:, FITNESS_COLUMN) = {NaN};
pop(:, USED_INPUT_COLUMN) = {NaN};

%%TODO evaluate initial population
% Generational loop. Each generation: sort and report the best individual,
% select by tournament, recombine with single point crossover, mutate,
% map every child through the grammar, evaluate it and finally merge the
% children with the old population.
for gen = 1:1:GENERATIONS
    tic
    %%Select TODO do not pick same individual for tournament
    % sort() is ascending and puts NaN last, so row 1 is the lowest error
    fitnesses = [pop{:, FITNESS_COLUMN}];
    [fitnesses, fitness_index] = sort(fitnesses);
    pop = pop(fitness_index, :);
    %Best individual
    out = '';
    if ~isempty(pop{1,OUTPUT_COLUMN})
        out = sprintf('%s', pop{1,OUTPUT_COLUMN}{:});
    end
    fprintf('BI %d %s %.5f %d\n', gen, out, pop{1,FITNESS_COLUMN}, pop{1,USED_INPUT_COLUMN})
    %Select the new population
    new_pop = cell(size(pop));
    tournament = cell(TOURNAMENT_SIZE, IND_SIZE);
    for i = 1:1:size(pop,1)
        %Run a tournament
        for j = 1:1:TOURNAMENT_SIZE
            pick = randi(size(pop,1),1,1);
            ind = pop(pick,:);
            tournament(j,:) = ind;
        end
        fitnesses = [tournament{:, FITNESS_COLUMN}];
        [fitnesses, fitness_index] = sort(fitnesses);
        tournament = tournament(fitness_index, :);
        %Add the winner of the tournament. Cell arrays have value
        %semantics, so the assignment stores a copy.
        winner = tournament(1,:);
        new_pop(i,:) = winner(:);
    end

    %%Crossover single point TODO do not pick same individual for crossover Uneven
    %%population sizes
    xo_pop = cell(size(pop));
    for i = 1:2:size(new_pop, 1)
        %Pick the chromosomes of two random parents
        parent1 = new_pop{randi(size(pop,1),1,1), INPUT_COULMN};
        parent2 = new_pop{randi(size(pop,1),1,1), INPUT_COULMN};
        point1 = randi(length(parent1),1,1);
        point2 = randi(length(parent2),1,1);
        if(rand(1) < CROSSOVER_PROBABILITY)
            % NOTE(review): both tails start AT the cut point, so the codon
            % at the cut appears in both children and the two children are
            % together two codons longer than their parents; confirm
            % whether (point+1:end) was intended.
            child1 = [parent1(1:point1) parent2(point2:end)];
            % The tail comes from parent1 (the original indexed the scalar
            % point1 here, which dropped the tail)
            child2 = [parent2(1:point2) parent1(point1:end)];
        else
            child1 = parent1;
            child2 = parent2;
        end
        %Create children
        children = {child1; child2;};
        for j = 1:1:length(children)
            ind = struct();
            ind.input = children{j};
            ind.output = {};
            ind.fitness = NaN;
            ind.used_input = NaN;
            %Add to the new population TODO do not create a new population
            xo_pop(i+j-1,:) = struct2cell(ind);
        end
    end

    %%Mutation uniform integer
    for i = 1:1:size(xo_pop,1)
        c_chrom = xo_pop{i,INPUT_COULMN};
        %Find which codons are mutated
        mut_v = find(MUTATION_PROBABILITY > rand(1,length(c_chrom)));
        %Set new codon values
        for j = 1:1:length(mut_v)
            c_chrom(mut_v(j)) = randi(CODON_SIZE,1,1);
        end
        xo_pop{i,INPUT_COULMN} = c_chrom;

        % Named codons rather than input so the built-in input() is not
        % shadowed
        codons = c_chrom;
        %%Generate: Map codons via rules to output. Produces output and used_input
        used_input = 0;
        wraps = 0;
        output = {};
        %Counter for the unexpanded symbols, since we do not remove any
        unexpanded_symbol_cnt = 1;

        %Initialise the unexpanded symbol list
        unexpanded_symbols = {cfg.s};
        while (wraps < MAX_WRAPS) && (length(unexpanded_symbols) >= unexpanded_symbol_cnt)
            % Wrap. The short-circuit on used_input > 0 guarantees
            % production_choices has been assigned before it is read.
            % NOTE(review): production_choices is the value from the last
            % expanded non-terminal, so this test can fire again while
            % only terminals are being consumed; confirm this is intended.
            if mod(used_input,length(codons)) == 0 && used_input > 0 && length(production_choices) > 1
                wraps = wraps + 1;
            end
            % Expand a production
            current_symbol = unexpanded_symbols{unexpanded_symbol_cnt};
            % Non-terminals are recognised by their leading '<'
            if strncmp(current_symbol, '<', 1)
                %Find the rule by the index in the NTs
                rule_index = find(strcmp(cfg.s_NT, current_symbol), 1);
                if isempty(rule_index)
                    error('Non-terminal %s has no rule in the grammar', current_symbol);
                end
                production_choices = cfg.rules{rule_index};
                % Select a production from the next codon, reading the
                % chromosome cyclically
                current_production = mod(codons(mod(used_input, length(codons)) + 1), length(production_choices)) + 1;
                % Use an input if there was more than 1 choice
                if length(production_choices) > 1
                    used_input = used_input + 1;
                end
                % Derivation order is left to right (depth-first)
                chosen_production = production_choices{current_production};
                % Insert the chosen production symbols right after the
                % current symbol; the trailing range is simply empty when
                % the current symbol is the last one
                unexpanded_symbols = {unexpanded_symbols{1:unexpanded_symbol_cnt} chosen_production{:} unexpanded_symbols{unexpanded_symbol_cnt + 1:end}};
            else
                % Add terminal to the output
                output = {output{:} current_symbol};
            end
            unexpanded_symbol_cnt = unexpanded_symbol_cnt + 1;
        end
        %Not completely expanded: the individual is invalid
        if length(unexpanded_symbols) >= unexpanded_symbol_cnt
            output = {};
        end
        xo_pop{i,OUTPUT_COLUMN} = output;
        xo_pop{i,USED_INPUT_COLUMN} = used_input;

        %% Fitness evaluation
        % Mean squared error against f_x; invalid individuals keep NaN.
        % The locals are called expression and residual because eval runs
        % in this workspace: variables named exp or error would hide the
        % built-in functions from the evaluated expression.
        fitness = NaN;
        if ~isempty(output)
            expression = sprintf('%s',output{:});
            %Look the expression up in the store (row 1 holds the
            %expressions, row 2 the fitnesses)
            fitness_i = [];
            if ~isempty(fitness_store)
                fitness_i = find(strcmp(fitness_store(1,:), expression));
            end
            if isempty(fitness_i)
                % The expression comes from the grammar file, which must
                % therefore be trusted
                s = eval(expression);
                residual = s - f_x;
                fitness = mean(residual.^2);
                fitness_store = [fitness_store {expression; fitness}];
            else
                fitness = fitness_store{2, fitness_i(1)};
            end
        end
        xo_pop{i,FITNESS_COLUMN} = fitness;

    end

    %% Replacement
    % Keep the best of old population plus children, reserving the last
    % ELITE_SIZE slots for the best individuals of the old population
    fitnesses = [pop{:, FITNESS_COLUMN}];
    [fitnesses, fitness_index] = sort(fitnesses);
    pop = pop(fitness_index, :);
    elites = pop(1:ELITE_SIZE,:);
    tot_pop = [pop; xo_pop];
    fitnesses = [tot_pop{:, FITNESS_COLUMN}];
    [fitnesses, fitness_index] = sort(fitnesses);
    tot_pop = tot_pop(fitness_index, :);
    pop = [tot_pop(1:size(pop, 1) - ELITE_SIZE,:); elites];
    toc
end
%pop
size(fitness_store)
