from textSpaceVector import textVector,textVectorCollection


class summarizationText(textVectorCollection):
  '''Represents a text to be summarized, as a collection of sentence vectors.'''

  def __init__(self,file,tokenPattern=r'\w+',sentenceSeparator='.',stopWordFile=None):
    '''Instantiates summarizationText.

    Cuts the text in $file$ into sentences on the separator string
    $sentenceSeparator$, loads each sentence as a textVector tokenized with
    regex $tokenPattern$, and adds it to the collection.  Optionally loads a
    stop-word list from $stopWordFile$.'''
    self.__stopWords = {} # dictionary of stopwords (key: stop word; value: 1)
    textVectorCollection.__init__(self)
    f = open(file, 'rU')
    try:
      # lower-case once so tokenization and stop-word matching are case-insensitive
      text = f.read().lower()
    finally:
      f.close()
    for sentence in text.split(sentenceSeparator):
      textVectorCollection.addVector(self,textVector(text=sentence,pattern=tokenPattern,name=sentence))

    if stopWordFile is not None:
      self.__loadStopWordList(stopWordFile)

  def __loadStopWordList(self,file):
    '''Loads the stop words contained in $file$ (one per line) into the
    internal dictionary of stop words (key: word, value: 1).'''
    f = open(file,'rU')
    try:
      # BUG FIX: was f.readLines() (no such method -> AttributeError)
      words = f.readlines()
    finally:
      f.close()
    for w in words:
      self.__stopWords[w.rstrip().lower()] = 1

  def __getRelevancyWeight(self,vector,collection=None):
    '''Returns the frequency-weighted mean tf.idf weight of $vector$'s tokens,
    computed against $collection$.  Returns 0.0 for a token-less vector.'''
    w = 0.0
    tokens = vector.getTokens()
    n = 0.0
    for t in tokens.keys():
      #if self.__stopWords.get(t,None) is not None:
      w += vector.getWeight(t,with_freq=True,with_idf=True,collection=collection)
      n += vector.freq(t)
    # BUG FIX: empty sentences (e.g. trailing separator) previously raised
    # ZeroDivisionError here
    if n == 0.0:
      return 0.0
    return w/n

  def summarize(self,ratio=0.2,collection = None):
    '''Summarizes the text with a reduction ratio of $ratio$, printing the
    selected sentences' ids in their original document order.
    $collection$ is used to compute tf.idf weights.'''
    scores = {}
    indexes = {}
    sentences = []
    for i in range(self.len()):
      sentence = textVectorCollection.get(self,i)
      scores[sentence] = self.__getRelevancyWeight(sentence,collection=collection)
      indexes[sentence] = i
      sentences.append(sentence)
    # best-scoring sentences first; key-based sort replaces the removed-in-py3
    # cmp sorter and is equivalent (stable, descending by score)
    sentences.sort(key=lambda s: -scores[s])
    # BUG FIX: clamp so a large $ratio$ cannot index past the end of the list
    indMax = min(int(ratio*self.len() + 1), len(sentences))
    # BUG FIX: the original matched indexes with 'is' (int identity), which
    # silently fails for indexes outside CPython's small-int cache; restore
    # document order with a plain sort on the stored index instead
    selected = sorted(sentences[:indMax], key=lambda s: indexes[s])
    for s in selected:
      print(s.getId())











