#!/usr/bin/python3
# -*- coding: utf-8 -*-

# Copyright © 2015-2016 marmuta <marmvta@gmail.com>
#
# This file is part of Onboard.
#
# Onboard is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Onboard is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import sys
import os
import operator
import optparse
import re

# Prefer a top-level pypredict module and fall back to the one inside the
# Onboard package. The fallback must be bound to the bare name "pypredict"
# too, because the rest of this file refers to it that way; a plain
# "import Onboard.pypredict" would only bind the name "Onboard".
try:
    import pypredict
except ImportError:
    from Onboard import pypredict


def main():
    """
    Print the letter frequencies found in a language model's unigrams.

    Expects one positional command line argument: either a language ID
    such as "en_US", which is looked up as "models/<language_id>.lm", or
    the filename of a language model.

    For every letter one line is written to stdout holding the letter,
    the number of times it was counted and its share of all counted
    letters. Lines are ordered by descending count; letters with equal
    counts appear in ascending letter order.

    Exits with status 1 and a message on stderr if the argument is missing
    or the resulting filename does not exist.
    """
    parser = optparse.OptionParser(
        usage="Usage: %prog [options] language_id|filename")
    # No options are defined, only the positional arguments are of interest.
    _options, args = parser.parse_args()

    if not args:
        print("Language ID, e.g. en_US, or filename of a language model required.", file=sys.stderr)
        sys.exit(1)

    fn = _get_model_filename(args[0])
    if not os.path.exists(fn):
        print("Invalid filename '{}'.".format(fn), file=sys.stderr)
        sys.exit(1)

    model = pypredict.DynamicModel()
    model.load(fn)

    frequencies = _count_letters(model.iter_ngrams())
    total_letters = sum(frequencies.values())

    # Most frequent letters first, equal counts in ascending letter order.
    # Negating the count lets a single ascending sort produce that order.
    ranked = sorted(frequencies.items(),
                    key=lambda item: (-item[1], item[0]))

    for letter, frequency in ranked:
        probability = frequency / total_letters
        print("'{}' {:10} {:10.4f}".format(letter, frequency, probability))


def _get_model_filename(lang_id):
    """
    Return the model filename for a command line argument.

    The argument is taken as a filename if a file of that name exists or
    if it doesn't begin with something that looks like a language ID
    (two lower-case letters, an underscore, two upper-case letters).
    Otherwise it is mapped to "models/<lang_id>.lm".
    """
    # re.match only anchors at the start, i.e. this is a prefix check.
    looks_like_lang_id = re.match(r"[a-z]{2}_[A-Z]{2}", lang_id)
    if os.path.exists(lang_id) or not looks_like_lang_id:
        return lang_id
    return os.path.join("models", lang_id + ".lm")


def _count_letters(ngrams):
    """
    Count the lower-cased characters of all unigrams in ngrams.

    Each item of ngrams must have the n-gram, a sequence of words, as
    its first element. Only n-grams of length one are considered and
    words starting with "<" are skipped (presumably control words of
    the model - TODO confirm). Every word contributes once, other
    fields of the item, such as counts, are not taken into account.

    Returns a dict mapping character to number of occurrences.
    """
    frequencies = {}
    for ng in ngrams:
        ngram = ng[0]
        if len(ngram) != 1:  # we're only interested in unigrams
            continue
        word = ngram[0]
        if word.startswith("<"):
            continue
        for char in word:
            # Lower-case per character, so that a character whose
            # lower-case form is longer than one code point still counts
            # as a single entry.
            char = char.lower()
            frequencies[char] = frequencies.get(char, 0) + 1
    return frequencies


# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()




