/*
 * Decompiled with CFR 0.152.
 */
package rl;

import rl.MarkovDecisionProcess;
import rl.Policy;
import rl.PolicyLearner;

/**
 * Value-iteration learner for a {@link MarkovDecisionProcess}.
 *
 * <p>Keeps one value estimate per state. Each call to {@link #train()}
 * performs a single sweep over all states, replacing every state's value with
 * the best one-step backed-up value over all actions. {@link #getPolicy()}
 * derives the greedy policy from the current estimates.
 */
public class ValueIteration
implements PolicyLearner {
    /** Discount factor applied to the expected value of successor states. */
    private final double gamma;
    /** Process queried for state/action counts, rewards and transition probabilities. */
    private final MarkovDecisionProcess process;
    /** Current value estimate for each state, indexed by state number. */
    private final double[] values;

    /**
     * Creates a learner and initializes every state's value to the largest
     * immediate reward available in that state.
     *
     * @param gamma   discount factor used when backing up successor values
     * @param process the process to learn on; action 0 is queried for every
     *                state, so it must be a valid action
     */
    public ValueIteration(double gamma, MarkovDecisionProcess process) {
        this.gamma = gamma;
        this.process = process;
        int stateCount = process.getStateCount();
        int actionCount = process.getActionCount();
        this.values = new double[stateCount];
        for (int state = 0; state < stateCount; state++) {
            double bestReward = process.reward(state, 0);
            for (int action = 1; action < actionCount; action++) {
                bestReward = Math.max(bestReward, process.reward(state, action));
            }
            this.values[state] = bestReward;
        }
    }

    /**
     * Performs one sweep of value iteration over all states.
     *
     * <p>Values are updated in place, so states processed later in the sweep
     * already see the new values of states processed earlier.
     *
     * @return the largest absolute change applied to any state's value
     *         during this sweep
     */
    public double train() {
        int stateCount = this.process.getStateCount();
        int actionCount = this.process.getActionCount();
        double difference = 0.0;
        for (int state = 0; state < stateCount; state++) {
            // Seed the maximum with action 0 rather than 0.0 so that states whose
            // action values are all negative keep their true (negative) maximum.
            double bestValue = actionValue(state, 0);
            for (int action = 1; action < actionCount; action++) {
                double candidate = actionValue(state, action);
                if (candidate > bestValue) {
                    bestValue = candidate;
                }
            }
            difference = Math.max(Math.abs(this.values[state] - bestValue), difference);
            this.values[state] = bestValue;
        }
        return difference;
    }

    /**
     * Builds the greedy policy for the current value estimates.
     *
     * <p>For every state the action with the highest backed-up value is
     * chosen; ties go to the lowest-numbered action.
     *
     * @return a policy holding one chosen action index per state
     */
    public Policy getPolicy() {
        int stateCount = this.process.getStateCount();
        int actionCount = this.process.getActionCount();
        int[] policy = new int[stateCount];
        for (int state = 0; state < stateCount; state++) {
            // Seed with action 0's real value (not 0.0) so the arg-max is
            // correct even when every action value is negative.
            int bestAction = 0;
            double bestValue = actionValue(state, 0);
            for (int action = 1; action < actionCount; action++) {
                double candidate = actionValue(state, action);
                if (candidate > bestValue) {
                    bestValue = candidate;
                    bestAction = action;
                }
            }
            policy[state] = bestAction;
        }
        return new Policy(policy);
    }

    /**
     * Computes the one-step backed-up value of taking {@code action} in
     * {@code state}: the immediate reward plus {@code gamma} times the
     * transition-probability-weighted sum of the current state values.
     */
    private double actionValue(int state, int action) {
        int stateCount = this.process.getStateCount();
        double expectedNext = 0.0;
        for (int next = 0; next < stateCount; next++) {
            expectedNext += this.process.transitionProbability(state, next, action) * this.values[next];
        }
        return this.process.reward(state, action) + this.gamma * expectedNext;
    }
}

