englishcorpus.py 1.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. import codecs
  2. import os
  3. import random
  4. PHRASES_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'phrases.txt')
  5. class EnglishCorpus(object):
  6. def __init__(self, phrases_file=PHRASES_FILE, min_length=None, max_length=None):
  7. self._countdown = 0
  8. self._previous = None
  9. self.phrases = []
  10. with codecs.open(phrases_file, "r", "utf-8") as f:
  11. for phrase in [l.strip() for l in f.readlines()]:
  12. if min_length and len(phrase) < min_length:
  13. continue
  14. if max_length and len(phrase) > max_length:
  15. continue
  16. self.phrases.append(phrase)
  17. def _countdown_to_shuffle(self):
  18. self._countdown -= 1
  19. if self._countdown < 0:
  20. self._countdown = random.randint(500, 1000)
  21. self.shuffle()
  22. def __len__(self):
  23. return len(self.phrases)
  24. def shuffle(self):
  25. random.shuffle(self.phrases)
  26. def random_choice(self):
  27. self._countdown_to_shuffle()
  28. choice = None
  29. while not choice or choice == self._previous:
  30. choice = random.choice(self.phrases)
  31. self._previous = choice
  32. return choice
  33. def random_sentences(self, no):
  34. self._countdown_to_shuffle()
  35. max_no = len(self) - no - 1
  36. start = random.randint(0, max_no)
  37. sentences = self.phrases[start:(start + no)]
  38. random.shuffle(sentences)
  39. return sentences