## procedures necessary for DNA string matching.

##  substring search.

# Write a function to generate a random string of length n consisting of
# the characters A, C, G, T.

# Operations for converting between strings and mutable arrays of characters:
# x = "abc"
# x = list("abc")   # to array
# s = "".join(x)        # back to string
# These operations are all non-destructive.

### For DNA matching, 
### we want to see if one sequence (a) is found in another sequence (b),
# while tolerating up to a maximum number of errors. We ultimately want to 
# also know the position in b where the best match (with the fewest number of
# errors) occurred.  We will do this incrementally:

#a. determine if a is a substring of b
#b. return the starting position in b where the substring a is found,
#c. rewrite function so that a certain number of errors are tolerated
#d. write bestmatch(a,b,me), which will return the starting position
#   in b where a is found, with the least number of errors <= me,
#   which is the maximum number of errors allowed.

# a:  The key is recognizing that this is an Exists-Forall type of loop:
# a is a substring of b iff there exists an index k in b such that
# for all 0<=i<len(a), a[i]==b[k+i]. Going into more detail, we should
# see that k should range from 0 to len(b)-len(a)
def substring(a,b):
  """Return True iff sequence a occurs contiguously somewhere inside b.

  a and b may be strings or lists of characters; elements are compared
  one by one with ==.  An empty a is found in every b (vacuous match at
  offset 0), and an a longer than b is never found.
  """
  width = len(a)
  # Candidate starting offsets in b run from 0 through len(b)-len(a);
  # the range is empty when a is longer than b.
  for start in range(len(b) - width + 1):
    # "forall" part: every element of a agrees with the aligned element of b
    if all(a[j] == b[start + j] for j in range(width)):
      return True   # "exists" part: one matching offset is enough
  return False
# substring

# One of the trickiest components to write was k<=len(b)-len(a).  To understand
# this, you can either try to construct an example, or deduce as follows:
# We know that any number used to index b must be >=0 and < len(b).
# Inside the inner while loop, we index b using the expression b[k+i].
# Thus we need to make sure that k+i < len(b).  But the largest possible value
# of i is len(a)-1, so we just need to make sure that k+len(a)-1 < len(b).
# But this is the same as k<len(b)-len(a)+1, which is the same as
# k<=len(b)-len(a).


# Quick demonstration: "cde" occurs inside "abcdef", so this prints True.
print(substring("cde","abcdef"))

#b.  return position (k) in b where a was found, -1 if a is not substring of b
def substringat(a,b):
  """Return the first index in b at which a occurs, or -1 if it never does.

  a and b may be strings or lists of characters.  An empty a is reported
  at index 0; an a longer than b yields -1.
  """
  for start in range(len(b) - len(a) + 1):
    # Walk a alongside b[start:]; abandon this offset at the first mismatch.
    for j, ch in enumerate(a):
      if ch != b[start + j]:
        break
    else:
      # The inner loop finished without a break: every element matched.
      return start
  return -1   # pessimistic default: no offset produced a match
# substringat

#c. For the next refinement, we are going to tolerate up to a maximum
#   number of errors (mismatched characters).  Also, we want to return
#   not just the position of the match, but also the number of errors in
#   the match: 0 will mean exact match, and maxerrors+1 for unacceptable match.

#   Find first occurrence of a in b that tolerates up to maxerrors.
#   returns (position,errors), or (-1,maxerrors+1) if none found.
def approxmatch(a,b,maxerrors):
  """Locate the first place in b where a fits with at most maxerrors mismatches.

  Returns a tuple (position, errors): position is the starting index in b
  and errors is the number of mismatched characters at that alignment.
  If no alignment is acceptable, returns (-1, maxerrors+1).

  Only the FIRST acceptable alignment is reported, which is not
  necessarily the one with the fewest errors.
  """
  pattern_len = len(a)
  for start in range(len(b) - pattern_len + 1):
    mismatches = 0   # optimistic assumption for this alignment
    for j in range(pattern_len):
      if mismatches > maxerrors:
        break        # already over budget, no need to compare further
      if a[j] != b[start + j]:
        mismatches += 1
    if mismatches <= maxerrors:
      return (start, mismatches)
  # Worst case: nothing within the error budget was found.
  return (-1, maxerrors + 1)
#approxmatch

print(approxmatch("ACTG","TCACGGTAG",1))  # prints (2, 1)

# HOWEVER:
print(approxmatch("ACTG","TCACGGTACTG",1))  
# will also print (2, 1), even though there's a match with no errors at
# position 7 near the end: approxmatch stops at the first acceptable match.

### We saw that in step b, we replaced ax with a numerical value representing
# the position of match, and in step c, we replaced bx with a numerical value
# representing the number of errors in the current substring in b being tested
# for a match.  The True/False logic remains, but has been enhanced so that
# the function computes more information.  The next step is up to you, and
# will require a few further modifications of the above function.

# Now for what we really want: return position of best match that
# tolerates up to maxerrors.  The function should again return the
# positions and number of errors in the best match.  For example,
# bestmatch("ACTG","TCACGGTACTG",1) should return (7,0).