# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""
@file simple_logistic.py_in

@brief Logistic Regression: Driver functions

@namespace simple_logistic

@brief Logistic Regression: Driver functions
"""
import plpy
from utilities.validate_args import table_exists
from utilities.validate_args import table_is_empty
# ------------------------------------------------------------------------


def logregr_simple_train(schema_madlib, source_table, out_table,
                         dependent_varname, independent_varname,
                         max_iter=None, tolerance=None, verbose=None,
                         **kwargs):
    """
    Train a logistic model.

    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
    @param source_table Name of the relation containing the training data
    @param out_table Name of the relation where the model will be written
    @param dependent_varname Name of the dependent column in the training
        data (of type BOOLEAN)
    @param independent_varname Name of the independent column in the
        training data (of type DOUBLE PRECISION[])
    @param max_iter The maximum number of iterations allowed
    @param tolerance The precision that the results should have
    @param kwargs We allow the caller to specify additional arguments (all
        of which will be ignored). The purpose of this is to allow the
        caller to unpack a dictionary whose element set is a superset of
        the arguments required by this function.

    @return A composite value of type __logregr_simple_result, defined in
        simple_logistic.sql_in
    """
    return __logregr_train_compute(
        schema_madlib, source_table, out_table, dependent_varname,
        independent_varname, max_iter, tolerance, verbose, **kwargs)
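
# Example invocation through the SQL wrapper (presumably declared in
# simple_logistic.sql_in), mirroring the 'example' help message below;
# 'madlib' stands in for the installation's schema name:
#
#   SELECT madlib.logregr_simple_train(
#       'patients', 'patients_logregr', 'second_attack',
#       'ARRAY[1, treatment, trait_anxiety]');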
# ========================================================================


def __logregr_train_compute(schema_madlib, tbl_source, tbl_output,
                            dep_col, ind_col, max_iter, tolerance,
                            verbose, **kwargs):
    """
    Validate the arguments and run the training iterations.
    """
    # Defaults mirror those documented in the help message below.
    if max_iter is None:
        max_iter = 20
    if tolerance is None:
        tolerance = 0.0001

    if tbl_source is None or tbl_source.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid data table name!")
    if not table_exists(tbl_source):
        plpy.error("Logregr error: Data table does not exist!")
    if table_is_empty(tbl_source):
        plpy.error("Logregr error: Data table is empty!")

    if tbl_output is None or tbl_output.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid output table name!")

    if not dep_col or dep_col.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid dependent column name!")
    # if not columns_exist_in_table(tbl_source, [dep_col]):
    #     plpy.error("Logregr error: Dependent column does not exist!")

    if not ind_col or ind_col.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid independent column name!")

    if max_iter <= 0:
        plpy.error("Logregr error: Maximum number of iterations must be positive!")
    if tolerance < 0:
        plpy.error("Logregr error: The tolerance cannot be negative!")

    # One iteration = one pass of the __logregr_simple_step aggregate over
    # the data; the transition state from the previous pass is handed back
    # in as the parameter $1.
    update_plan = plpy.prepare(
        """
        SELECT {schema_madlib}.__logregr_simple_step(
            ({dep_col})::boolean,
            ({ind_col})::double precision[],
            $1) AS state
        FROM {tbl_source}
        """.format(schema_madlib=schema_madlib,
                   dep_col=dep_col,
                   ind_col=ind_col,
                   tbl_source=tbl_source),
        ["double precision[]"])

    # This simple implementation always runs the full max_iter passes;
    # tolerance is validated above but not used as a stopping criterion.
    state = None
    for _ in range(max_iter):
        res_tuple = plpy.execute(update_plan, [state])
        state = res_tuple[0]['state']

    # A prepared plan may contain only a single statement, so the DROP is
    # executed separately from the CREATE TABLE.
    plpy.execute("DROP TABLE IF EXISTS {tbl_output}".format(
        tbl_output=tbl_output))
    output_plan = plpy.prepare(
        """
        CREATE TABLE {tbl_output} AS
            SELECT
                (result).coef AS coef,
                (result).log_likelihood AS log_likelihood
            FROM (
                SELECT * FROM {schema_madlib}.__logregr_simple_finalizer($1)
            ) result
        """.format(schema_madlib=schema_madlib, tbl_output=tbl_output),
        ["double precision[]"])
    plpy.execute(output_plan, [state])

    return None
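
# For intuition, the helper below is a minimal pure-Python sketch of the
# computation that the __logregr_simple_step / __logregr_simple_finalizer
# pair carries out inside the database. It assumes a standard Newton (IRLS)
# update; the actual in-database transition function may use a different
# update rule. The function name is hypothetical, the helper is never called
# by this driver, and it requires numpy.
def _reference_logregr_fit(X, y, max_iter=20):
    """Illustrative Newton/IRLS logistic fit; returns (coef, log_likelihood)."""
    import numpy as np  # assumed available; used only by this reference helper
    X = np.asarray(X, dtype=float)              # n x k design matrix
    y = np.asarray(y, dtype=float)              # n boolean responses as 0/1
    coef = np.zeros(X.shape[1])
    for _ in range(max_iter):
        p = 1.0 / (1.0 + np.exp(-X.dot(coef)))  # predicted probabilities
        w = p * (1.0 - p)                       # IRLS weights
        grad = X.T.dot(y - p)                   # gradient of the log likelihood
        hess = (X * w[:, None]).T.dot(X)        # negative Hessian
        coef += np.linalg.solve(hess, grad)     # Newton step
    p = 1.0 / (1.0 + np.exp(-X.dot(coef)))
    log_likelihood = float(np.sum(y * np.log(p) + (1.0 - y) * np.log(1.0 - p)))
    return coef, log_likelihood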
# --------------------------------------------------------------------


def logregr_simple_help_msg(schema_madlib, message, **kwargs):
    """
    Help message for logistic regression

    @param message A string, the help message indicator

    Returns:
        A string containing the help message
    """
    if not message:
        help_string = """
----------------------------------------------------------------
                        SUMMARY
----------------------------------------------------------------
Binomial logistic regression models the relationship between a
dichotomous dependent variable and one or more predictor variables.
The dependent variable may be a Boolean value or a categorical
variable that can be represented with a Boolean expression.

For more details on function usage:
    SELECT {schema_madlib}.logregr_simple_train('usage')

For a small example on using the function:
    SELECT {schema_madlib}.logregr_simple_train('example')
"""
    elif message in ['usage', 'help', '?']:
        help_string = """
------------------------------------------------------------------
                        USAGE
------------------------------------------------------------------
SELECT {schema_madlib}.logregr_simple_train(
    source_table,        -- name of input table
    out_table,           -- name of output table
    dependent_varname,   -- name of dependent variable
    independent_varname, -- name of independent variables (array expression)
    max_iter,            -- optional, default 20, maximum number of iterations
    tolerance,           -- optional, default 0.0001, the stopping threshold
    verbose              -- optional, default FALSE, whether to print useful info
);

------------------------------------------------------------------
                        OUTPUT
------------------------------------------------------------------
The output table ('out_table' above) has the following columns:
    'coef',           double precision[], -- vector of the fitted coefficients
    'log_likelihood', double precision    -- log likelihood of the fitted model
"""
    elif message in ['example', 'examples']:
        help_string = """
CREATE TABLE patients(
    id            INTEGER NOT NULL,
    second_attack BOOLEAN,
    treatment     INTEGER,
    trait_anxiety INTEGER);

COPY patients FROM STDIN WITH DELIMITER '|';
  1 | True  | 1 | 70
  3 | True  | 1 | 50
  5 | True  | 0 | 40
  7 | True  | 0 | 75
  9 | True  | 0 | 70
 11 | False | 1 | 65
 13 | False | 1 | 45
 15 | False | 1 | 40
 17 | False | 0 | 55
 19 | False | 0 | 50
  2 | True  | 1 | 80
  4 | True  | 0 | 60
  6 | True  | 0 | 65
  8 | True  | 0 | 80
 10 | True  | 0 | 60
 12 | False | 1 | 50
 14 | False | 1 | 35
 16 | False | 1 | 50
 18 | False | 0 | 45
 20 | False | 0 | 60
\\.

-- Drop the output table before calling the function
DROP TABLE IF EXISTS patients_logregr;

SELECT madlib.logregr_simple_train(
    'patients', 'patients_logregr', 'second_attack',
    'ARRAY[1, treatment, trait_anxiety]');

SELECT * FROM patients_logregr;
"""
    else:
        help_string = "No such option. Use {schema_madlib}.logregr_simple_train('help')"

    return help_string.format(schema_madlib=schema_madlib)
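
# For orientation, a sketch of how this driver is usually exposed on the SQL
# side. This is an assumption based on the common MADlib convention of
# generating PL/Python UDF bodies with the PythonFunction m4 macro; the
# directory name 'hello_world' and the exact signature are hypothetical --
# see simple_logistic.sql_in for the authoritative declarations:
#
#   CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_simple_train(
#       source_table        VARCHAR,
#       out_table           VARCHAR,
#       dependent_varname   VARCHAR,
#       independent_varname VARCHAR)
#   RETURNS VOID AS $$
#       PythonFunction(hello_world, simple_logistic, logregr_simple_train)
#   $$ LANGUAGE plpythonu VOLATILE;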