#!/usr/bin/python3
import sys
import os

# Allowed letters in Sicilian: the lowercase letters listed below plus
# the five grave-accented vowels. k, w, x and y are not in the set, and
# neither are uppercase letters, so word_yielder() rejects any token
# containing one of them (including capitalised words).
LETTERS = set('qertuiopasdfghjlzcvbnmàèìòù')


def get_from_dir(dirname: str) -> set[str]:
    '''
    Load all the files in the directory and return a
    set of all their lines

    Every entry of dirname is opened as UTF-8 text. Each line is
    stripped of surrounding whitespace before being added, so a blank
    line contributes the empty string. An empty directory yields an
    empty set.
    '''
    lines = set()
    for name in os.listdir(dirname):
        # Explicit UTF-8 so accented words are decoded the same way
        # regardless of the platform's locale encoding.
        with open(os.path.join(dirname, name), 'rt', encoding='utf-8') as f:
            # Grow the set in place rather than rebuilding it per file.
            lines.update(line.strip() for line in f)
    return lines


def word_yielder(fname: str):
    '''
    Load the wikipedia dump file, split by
    word, exclude all words with non-sicilian letters

    then exclude also the words from the exclusion sets

    fname is read as UTF-8 text. The exclusion words are loaded from
    the 'exclude' directory, resolved relative to the current working
    directory, when the generator is first advanced.

    Yields each surviving word, once per occurrence in the file.
    '''
    exclusions = get_from_dir('exclude')
    # Explicit UTF-8: the letter filter below relies on the accented
    # vowels being decoded correctly, whatever the platform default is.
    with open(fname, 'rt', encoding='utf-8') as f:
        for line in f:
            # Tokens are separated by single spaces only; a token with
            # an embedded tab is rejected by the letter filter.
            for token in line.split(' '):
                word = token.strip()

                if not word:
                    continue
                # Reject the token if any character is outside LETTERS.
                if not LETTERS.issuperset(word):
                    continue
                if word in exclusions:
                    continue

                yield word


def word_grouper(yielder) -> dict[str, int]:
    '''
    Count the occurrences of every string produced by the
    iterable and return them as a {string: count} mapping

    Keys appear in the order in which each string was first seen.
    '''
    counts = {}
    for word in yielder:
        counts[word] = counts.get(word, 0) + 1
    return counts


def group_sorter(d: dict[str, int], min_count: int = 3):
    '''
    Drop the uncommon words from a {word: count} mapping

    Returns a generator over the words whose count is at least
    min_count (3 by default), in the iteration order of d. Despite
    the name no sorting is done here and word length is not checked;
    the caller is expected to sort the result.
    '''
    return (word for word, count in d.items() if count >= min_count)


def main():
    '''
    Print the sorted list of common words found in the dump file
    given as first command line argument, one word per line

    Exits with a usage message on stderr if the argument is missing.
    '''
    if len(sys.argv) < 2:
        # A string passed to sys.exit() is printed to stderr and the
        # exit status is 1.
        sys.exit(f'usage: {sys.argv[0]} DUMP_FILE')

    words = word_yielder(sys.argv[1])
    groups = word_grouper(words)
    wlist = sorted(group_sorter(groups))
    print('\n'.join(wlist))


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
