#!/usr/bin/pypy3

# find k-mer coordinates in genome, by chr

import os
import re
import sys
import argparse

def load_kmers(lines):
    """Read k-mers from an iterable of tab-separated text lines.

    Only the first column of each line is used. Lines whose first column
    is empty (e.g. a trailing blank line) are skipped so that they cannot
    produce a zero-length k-mer.

    Returns a tuple ``(kmers, k)`` where ``kmers`` is the set of k-mer
    strings and ``k`` is the length of the last k-mer read (0 when no
    k-mer was found). K-mers are stored exactly as written; no case
    conversion is applied.
    """
    kmers = set()
    k = 0
    for line in lines:
        kmer = line.rstrip('\n').split('\t')[0]
        if not kmer:
            continue
        kmers.add(kmer)
        k = len(kmer)
    return kmers, k


def chrom_name(path):
    """Return the label printed in the first output column for *path*.

    The label is the last '/'-separated component of the path with a
    trailing '.fa' extension removed, if present.
    """
    name = path.split('/')[-1]
    if name.endswith('.fa'):
        name = name[:-len('.fa')]
    return name


def read_sequence(lines):
    """Concatenate the sequence lines of a FASTA stream into one string.

    Lines starting with '>' are skipped; every other line has its
    trailing newline removed and is upper-cased. All records in the
    stream are joined into a single sequence, so coordinates computed on
    the result are relative to that concatenation. The current line
    number is written to stderr every 100000 lines as a progress
    indicator.
    """
    parts = []
    for ln, line in enumerate(lines):
        if ln % 100000 == 0:
            sys.stderr.write(str(ln) + '\r')
        if not line.startswith('>'):
            parts.append(line.rstrip('\n').upper())
    return ''.join(parts)


def find_kmers(seq, kmers, k):
    """Yield ``(start, end, kmer)`` for every length-*k* window of *seq*
    that is a member of *kmers*.

    ``start`` is 0-based and ``end`` is ``start + k``. Only complete
    windows are examined, so ``end`` never exceeds ``len(seq)``.
    Nothing is yielded when *k* is less than 1.
    """
    if k < 1:
        return
    for start in range(len(seq) - k + 1):
        window = seq[start:start + k]
        if window in kmers:
            yield start, start + k, window


def main():
    """Parse the command line and print k-mer hits for each FASTA file.

    One tab-separated line is printed to stdout per hit:
    label, start, end, k-mer.
    """
    parser = argparse.ArgumentParser(description="argument parser")
    parser.add_argument('-f', help='fasta file with genome', required=True, nargs='+')
    parser.add_argument('-k', help='file with k-mers', required=True, type=argparse.FileType('r'))
    args = parser.parse_args()

    with args.k as kmer_file:
        kmers, k = load_kmers(kmer_file)
    if not kmers:
        sys.exit('error: no k-mers found in ' + args.k.name)
    if any(len(kmer) != k for kmer in kmers):
        # The scan uses a single window length, so k-mers of any other
        # length can never be reported.
        sys.stderr.write('warning: k-mers differ in length; only those of length '
                         + str(k) + ' can match\n')

    for path in args.f:
        label = chrom_name(path)
        with open(path, 'r') as handle:
            seq = read_sequence(handle)

        sys.stderr.write('\nFinding kmers in ' + path + '\n')
        for start, end, kmer in find_kmers(seq, kmers, k):
            print('\t'.join((label, str(start), str(end), kmer)))


if __name__ == '__main__':
    main()

