Paste Search Dynamic
word tokenize
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Tue Nov 10 14:50:20 2020
  4.  
  5. @author: b-ose
  6. """
  7.  
  8. #stamp  
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
  13.  
  14. texte = "A chief aim of the Constitution as drafted by the Convention was to create a government with enough power to act on a national level, but without so much power that fundamental rights would be at risk. One way that this was accomplished was to separate the power of government into three branches, and then to include checks and balances on those powers to assure that no one branch of government gained supremacy. This concern arose largely out of the experience that the delegates had with the King of England and his powerful Parliament. The powers of each branch are enumerated in the Constitution, with powers not assigned to them reserved to the states."
  15. texte.lower().split()
  16.  
  17. texte = texte.lower()
  18.  
  19. sentence_list = sent_tokenize(texte)
  20. print(sentence_list)
  21. word_list = word_tokenize(texte)
  22. print(word_list)
  23.  
  24.  
  25. liste_ponct = [",", ".", "!", "?", ";"]
  26.  
  27. #renvoie la liste sans les les stopwords
  28. word_to_del = liste_ponct + stopwords.words('english')
  29.  
  30. word_list = [word for word in word_list if word not in word_to_del]
  31.  
  32. #retourne une liste avec les éléments qui apparaissent une fois
  33. unique_word_list = list(set(word_list))
  34.  
  35.  
  36. #retourne le nombre d'occurence des éléments de la liste dans un dictionnaire
  37. word_occurence_dict = {}
  38.  
  39. for unique_word in unique_word_list :
  40.     i=0
  41.     for word in word_list:
  42.         if unique_word == word:
  43.             i+=1
  44.     word_occurence_dict[unique_word]=i
  45. print(word_occurence_dict)    
  46.  
  47.  #voir le radical des mots et les retirer  
  48. from nltk.stem import PorterStemmer
  49. ps = PorterStemmer()
  50. #words = ["friends","was","girls","sailor"]
  51.  
  52. #for w in words:
  53. #    print(w, " : ", ps.stem(w))
  54.    
  55. word_list_stem = [ps.stem(word) for word in word_list]
  56. print(word_list_stem)    
  57.  
  58.  
  59. #
  60. nltk.download('wordnet')
  61.  
  62. from nltk.stem import WordNetLemmatizer
  63. lemmatizer = WordNetLemmatizer()
  64. print(lemmatizer.lemmatize("caught"))
  65.  
  66. word_list_lem = [word for word in word_list]
  67. print(word_list_lem)
Parsed in 0.017 seconds