function cfg = get_grammar(lines)
% GET_GRAMMAR Parse a BNF grammar.
%
% CFG = GET_GRAMMAR(LINES) parses LINES and returns a grammar struct
% describing the context free grammar (cfg) tuple <T, NT, R, S>.
%
% LINES is a cell containing the lines of a BNF grammar, one rule per
% line in the form
%     <lhs> ::= production | production | ...
% Empty or blank lines and lines whose first non-blank character is '#'
% are skipped.
%
% CFG has the fields
%   rules       cell with one entry per rule; each entry is a cell of
%               productions and each production is a cell of symbols
%   s_T         cell of the unique terminal strings
%   s_NT        cell of the non-terminal strings, in the order of the rules
%   s           start symbol (the lhs of the first rule)
%   recursive_rules      output of GET_RECURSIVE_RULES
%   min_depths           output of GET_MIN_DEPTHS, one value per rule
%   min_depth            min_depths(1), i.e. the start symbol's value
%   conserving_rules_NT  output of GET_CONSERVING_RULES (rule indices)
%   unique_production_choice_ids  same nesting as rules, with every
%               production replaced by a running integer id
%   conserving_rules     ids of the production choices that belong to the
%               rules listed in conserving_rules_NT
%
% An error is raised when a line has no '::=' separator, when a lhs is
% not a non-terminal, when a lhs is defined twice, or when LINES holds no
% rule at all.
%
% TODO allow rules to be on separate lines and comments at the end of lines
% TODO better data structures for cfg
% TODO rhs rules on multiple lines
% TODO checks for correctness of bad parses
% TODO does not handle quoted NT symbols Does not use the names
% TODO automatically infer recursive rules
% TODO automatically infer min depth
% TODO automatically infer conserving rules
% TODO does not handle recursive rules which span two rules, e.g. <A> ::=
% <B>; <B> ::= <A>

NON_TERMINAL_PATTERN = '(<.+?>)';
RULE_SEPARATOR = '::=';
PRODUCTION_SEPARATOR = '\|';
COMMENT = '#';
SYMBOL_PATTERN = '(?<NT><.+?>)|(?<T>[^<>]*)';

cfg.rules = {}; % rules
cfg.s_T = {}; % Terminals
cfg.s_NT = {}; % Non-Terminals
cfg.s = ''; % Start symbol

%% Parse the lines
% numel instead of size(lines, 2) so that column cells are read as well
for i = 1:numel(lines)
    line = lines{i};
    if isempty(line) || isempty(strtrim(line)) || ...
            strncmp(strtrim(line), COMMENT, 1)
        continue
    end

    %% Find lhs and rhs
    splits = strtrim(regexp(line, RULE_SEPARATOR, 'split'));
    if numel(splits) < 2
        error('Line %d has no rule separator %s: %s', i, RULE_SEPARATOR, line);
    end
    lhs = splits{1};
    if isempty(regexp(lhs, NON_TERMINAL_PATTERN, 'once'))
        error('LHS %s must be non-terminal',lhs);
    end
    if any(strcmp(cfg.s_NT, lhs))
        error('lhs %s must be unique',lhs);
    end
    cfg.s_NT{end+1} = lhs;

    splits_prod = strtrim(regexp(splits{2}, PRODUCTION_SEPARATOR, 'split'));
    %Version 7.4 does not capture rules with one production
    if isempty(splits_prod) && ~isempty(splits{2})
        splits_prod = splits(2);
    end

    %% Find productions
    rhs = {};
    for j = 1:length(splits_prod)
        production = splits_prod{j};
        tmp_production = {};
        %% Match non terminal or terminal pattern
        % The same pattern handles productions with and without
        % non-terminals, so no separate terminal-only case is needed.
        symbols = regexp(production, SYMBOL_PATTERN, 'tokens');
        % Whitespaces in the grammar are preserved
        for k = 1:length(symbols)
            % Each match is returned as a cell holding its token, so it
            % can be concatenated onto the cell arrays directly.
            symbol = symbols{k};
            if ~strncmp(symbol, '<', 1)
                cfg.s_T = [cfg.s_T symbol];
            end
            tmp_production = [tmp_production{:} symbol];
        end
        rhs{end+1} = tmp_production;
    end
    cfg.rules{end+1} = rhs;
end

if isempty(cfg.s_NT)
    error('Grammar contains no rules');
end
% Compare the element counts; comparing the size vectors with ~= inside
% an if is only true when every dimension differs and so never fires for
% two row cells.
if numel(cfg.s_NT) ~= numel(cfg.rules)
    error('NTs must equal rules in size');
end

cfg.s_T = unique(cfg.s_T);
cfg.s = cfg.s_NT{1};

cfg.recursive_rules = get_recursive_rules(cfg.rules, cfg.s_NT);
cfg.min_depths = get_min_depths(cfg.rules, cfg.s_NT);
%Start symbol min_depth
cfg.min_depth = cfg.min_depths(1);

cfg.conserving_rules_NT = get_conserving_rules(cfg.rules, cfg.s_NT, cfg.recursive_rules);
% Set unique production choice ids
cfg.unique_production_choice_ids = cfg.rules;
cfg.conserving_rules = [];
cnt = 1;
for i = 1:length(cfg.rules)
    is_conserving = any(cfg.conserving_rules_NT == i);
    for j = 1:length(cfg.rules{1,i})
        cfg.unique_production_choice_ids{1,i}{j} = cnt;
        % Store the id that was just assigned, before the counter moves
        % on; otherwise the entry would name the following production.
        if is_conserving
            cfg.conserving_rules = [cfg.conserving_rules cnt];
        end
        cnt = cnt + 1;
    end
end


%% GET_CONSERVING_RULES
function conserving_rules = get_conserving_rules(rules, NTs, recursive_rules)
% GET_CONSERVING_RULES Indices of the rules classified as conserving.
%
% CONSERVING_RULES = GET_CONSERVING_RULES(RULES, NTS, RECURSIVE_RULES)
% evaluates GET_CONSERVING_RULE for every rule and returns, as a row of
% indices into RULES, those whose step count is greater than -1 and
% finite. The intent is to find the rules which have the same size for
% all possible subtrees.
%
%    RULES cell of rules
%    NTS cell of non-terminals, ordered as RULES
%    RECURSIVE_RULES cell of recursive non-terminals

conserving_rules = [];
n_rules = size(rules, 2);
for rule_idx = 1:n_rules
    steps = get_conserving_rule(NTs{rule_idx}, rules, NTs, recursive_rules);
    is_conserving = steps > -1 && ~isinf(steps);
    if is_conserving
        conserving_rules(end+1) = rule_idx;
    end
end
   
%% GET_CONSERVING_RULE
function c_rule_steps = get_conserving_rule(rule, rules, NTs, recursive_rules)
% GET_CONSERVING_RULE Step count used to classify one rule as conserving.
%
% C_RULE_STEPS = GET_CONSERVING_RULE(RULE, RULES, NTS, RECURSIVE_RULES)
% compares the non-terminals of the production choices of RULE. The
% first production that contains non-terminals is the reference; every
% later production is checked against it. The result is
%    0   when no production contains a non-terminal, or when the
%        accumulated count is infinite and a later production was compared
%   -1   when the productions differ in their number of non-terminals,
%        when they differ in symbols and step counts (or a count is
%        infinite), or when no later production was compared
%   otherwise the count accumulated for the reference production.
%
%    RULE string of the rule to inspect
%    RULES cell of rules
%    NTS cell of non-terminals, ordered as RULES
%    RECURSIVE_RULES cell of recursive non-terminals

choices = rules{find(strcmp(NTs, rule))};
c_rule_steps = -1;
reference_nts = {};
candidate_nts = {};

for p = 1:size(choices,2)
    if isempty(reference_nts)
        % No reference yet: this production supplies it, and its steps
        % are added on top of the running count.
        [reference_nts, added_steps] = collect_nt_steps(choices{p}, ...
            rules, NTs, recursive_rules);
        c_rule_steps = c_rule_steps + added_steps;
    else
        [candidate_nts, rule_steps] = collect_nt_steps(choices{p}, ...
            rules, NTs, recursive_rules);
    end

    if p == 1 || isempty(reference_nts)
        continue
    end

    if ~isequal(size(reference_nts), size(candidate_nts))
        differs = true;
    elseif any(~strcmp(reference_nts, candidate_nts))
        % Different symbols are tolerated only for equal, finite counts
        differs = isinf(c_rule_steps) || ...
            isinf(rule_steps) || ...
            c_rule_steps ~= rule_steps;
    else
        differs = false;
    end
    if differs
        c_rule_steps = -1;
        break
    end
end

if isempty(reference_nts)
    %All terminals
    c_rule_steps = 0;
elseif isempty(candidate_nts)
    c_rule_steps = -1;
elseif isinf(c_rule_steps)
    %All recursive non-terminals
    c_rule_steps = 0;
end

function [nts, steps] = collect_nt_steps(production, rules, NTs, recursive_rules)
% Collects the non-terminals of PRODUCTION in order and sums their
% DEPTH_FIRST_EXPANSION step counts (each started at step 1).
nts = {};
steps = 0;
for s = 1:size(production,2)
    symbol = production{s};
    if strncmp(symbol, '<', 1)
        steps = steps + ...
            depth_first_expansion(symbol, rules, NTs, 1, recursive_rules);
        nts = [nts {symbol}];
    end
end

function step = depth_first_expansion(rule, rules, NTs, step, recursive_rules)
% DEPTH_FIRST_EXPANSION Counts the steps in a depth first expansion.
%
% STEP = DEPTH_FIRST_EXPANSION(RULE, RULES, NTS, STEP, RECURSIVE_RULES)
% walks every production choice of RULE. Each non-terminal that is not
% listed in RECURSIVE_RULES is expanded recursively with STEP increased
% by one; a non-terminal listed in RECURSIVE_RULES sets STEP to inf.
% Returns the resulting number of STEPs.
%
%    RULE string of expanded rule
%    RULES cell of rules
%    NTS cell of non-terminals
%    STEP integer denoting the current step
%    RECURSIVE_RULES cell of recursive rules

choices = rules{find(strcmp(NTs, rule))};
n_choices = size(choices,2);
for c = 1:n_choices
    symbols = choices{c};
    for s = 1:size(symbols,2)
        symbol = symbols{s};
        if ~strncmp(symbol, '<', 1)
            % Terminals do not contribute to the count
            continue
        end
        if any(strcmp(recursive_rules, symbol))
            step = inf;
        else
            next_rule = NTs{find(strcmp(NTs, symbol))};
            step = depth_first_expansion(next_rule, rules, NTs, ...
                step+1, recursive_rules);
        end
    end
end
    
%% GET_MIN_DEPTHS
function min_depths = get_min_depths(rules, NTs)
% GET_MIN_DEPTHS Minimum depth of every rule.
%
% MIN_DEPTHS = GET_MIN_DEPTHS(RULES, NTS) returns an array of the same
% size as RULES. Entry i is the first output of GET_MIN_DEPTH for NTS{i},
% started at depth 1 with an infinite running minimum and no visited
% rules.
%
%    RULES cell of rules
%    NTS cell of non-terminals, ordered as RULES

n_rules = size(rules,2);
min_depths = ones(size(rules));
for rule_idx = 1:n_rules
    start_rule = NTs{rule_idx};
    min_depths(rule_idx) = get_min_depth(start_rule, rules, NTs, 1, Inf, {});
end
%% GET_MIN_DEPTH
function [min_depth tmp_depth] = get_min_depth(start_rule, rules, NTs, depth, tmp_depth, visited_rules)
% GET_MIN_DEPTH Finds the minimum depth required to terminate a subtree.
%
% [MIN_DEPTH TMP_DEPTH] = GET_MIN_DEPTH(START_RULE, RULES, NTS, DEPTH,
% TMP_DEPTH, VISITED_RULES) returns the minimum depth required to
% terminate the derivation of START_RULE when START_RULE itself sits at
% DEPTH.
%
% A production terminates at the depth of its deepest non-terminal (or at
% DEPTH when it has none); the rule terminates at the shallowest of its
% productions. The result therefore does not depend on the order in which
% productions or symbols are written.
%
%    START_RULE string of the rule to expand
%    RULES cell of rules
%    NTS cell of non-terminals, ordered as RULES
%    DEPTH depth of START_RULE
%    TMP_DEPTH running minimum supplied by the caller
%    VISITED_RULES cell of the rules already on the current derivation
%    path
%
%    MIN_DEPTH minimum depth for START_RULE; Inf when START_RULE is in
%    VISITED_RULES or cannot terminate
%    TMP_DEPTH the smaller of the supplied TMP_DEPTH and MIN_DEPTH
%
% An error is raised when START_RULE has no entry in NTS.

min_depth = Inf;
% Avoid recursive rules: a rule already on the path cannot shorten it
if ~any(strcmp(start_rule, visited_rules))
    rule_index = find(strcmp(NTs, start_rule), 1);
    if isempty(rule_index)
        error('Non-terminal %s has no rule', start_rule);
    end
    production_choices = rules{rule_index};
    visited_rules = [visited_rules {start_rule}];
    for j = 1:size(production_choices,2)
        production = production_choices{j};
        % A production without non-terminals terminates right here
        production_depth = depth;
        for k = 1:size(production,2)
            current_symbol = production{k};
            if strncmp(current_symbol, '<', 1)
                symbol_depth = get_min_depth(current_symbol, rules, ...
                    NTs, depth + 1, Inf, visited_rules);
                % Every symbol must terminate, so the deepest one decides
                production_depth = max(production_depth, symbol_depth);
                if isinf(production_depth)
                    break
                end
            end
        end
        min_depth = min(min_depth, production_depth);
    end
end
% Second output kept so existing two-output calls continue to work
tmp_depth = min(tmp_depth, min_depth);
    
%% GET_RECURSIVE_RULES
function recursive_rules = get_recursive_rules(rules, NTs)
% GET_RECURSIVE_RULES Finds recursive rules.
%
% RECURSIVE_RULES = GET_RECURSIVE_RULES(RULES, NTS) returns the
% non-terminals of NTS that can be reached again from their own
% productions, either directly or through other rules. The result is a
% cell of strings in the order of RULES, or an empty cell when no rule is
% recursive.
%
%    RULES cell array of rules
%    NTS cell array of non-terminal symbols matching the rule ordered as
%    the rules in the BNF specification
%
% An error is raised when a production uses a non-terminal that has no
% entry in NTS.

recursive_rules = {};
for i = 1:size(rules,2)
    rule = NTs{i};

    % Seed the work list with the non-terminals used directly by the rule
    reachable = {};
    for j = 1:size(rules{i},2) % Production
        for k = 1:size(rules{i}{j},2) % Symbols
            symbol = rules{i}{j}{k};
            % TODO do not use only <
            if strncmp(symbol, '<', 1) && ~any(strcmp(symbol, reachable))
                reachable{end+1} = symbol;
            end
        end
    end

    % Breadth first walk over everything reachable from the rule. The
    % work list doubles as the set of symbols already seen, so each
    % non-terminal is expanded at most once.
    is_recursive = false;
    visit_cnt = 1;
    while visit_cnt <= size(reachable,2) && ~is_recursive
        current_rule = reachable{visit_cnt};
        if strcmp(rule, current_rule)
            is_recursive = true;
        else
            rule_index = find(strcmp(NTs, current_rule), 1);
            if isempty(rule_index)
                error('Non-terminal %s is used but has no rule', current_rule);
            end
            production_choices = rules{rule_index};
            for j = 1:size(production_choices,2)
                production = production_choices{j};
                for k = 1:size(production,2)
                    current_symbol = production{k};
                    if strncmp(current_symbol, '<', 1) && ...
                            ~any(strcmp(current_symbol, reachable))
                        reachable{end+1} = current_symbol;
                    end
                end
            end
            visit_cnt = visit_cnt + 1;
        end
    end

    if is_recursive
        recursive_rules{end+1} = rule;
    end
end
    
