# Load the iris dataset and the three Naive Bayes variants we will compare.
# (Reformatted: the original had all statements collapsed onto one line.)
import numpy as np
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split

# X: (150, 4) feature matrix, y: (150,) class labels for the 3 iris species.
X, y = datasets.load_iris(return_X_y=True)
6.2、高斯分布朴素贝叶斯表现
1 2 3 4 5 6 7 8 9 10
# Estimate GaussianNB's mean accuracy over 100 random train/test splits.
# BUG FIX: the original read "for i inrange(100)" (missing space) — a syntax error.
score = 0
model = GaussianNB()
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model.fit(X_train, y_train)
    # Each run contributes 1/100 of its accuracy -> running mean over 100 runs.
    score += model.score(X_test, y_test) / 100
print('高斯朴素贝叶斯模型平均预测准确率:', score)
'''
高斯朴素贝叶斯模型平均预测准确率: 0.9557894736842099
'''
6.3、伯努利分布朴素贝叶斯表现
1 2 3 4 5 6 7 8 9 10
# Estimate BernoulliNB's mean accuracy over 100 random train/test splits.
# BernoulliNB assumes binary features, so it fits iris's continuous features
# poorly — hence the low reported accuracy below.
# BUG FIX: the original read "for i inrange(100)" (missing space) — a syntax error.
score = 0
model = BernoulliNB()
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model.fit(X_train, y_train)
    # Each run contributes 1/100 of its accuracy -> running mean over 100 runs.
    score += model.score(X_test, y_test) / 100
print('伯努利朴素贝叶斯模型平均预测准确率:', score)
'''
伯努利朴素贝叶斯模型平均预测准确率: 0.26105263157894737
'''
6.4、多项式分布朴素贝叶斯表现
1 2 3 4 5 6 7 8 9 10
# Estimate MultinomialNB's mean accuracy over 100 random train/test splits.
# BUG FIX: the original read "for i inrange(100)" (missing space) — a syntax error.
score = 0
model = MultinomialNB()
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model.fit(X_train, y_train)
    # Each run contributes 1/100 of its accuracy -> running mean over 100 runs.
    score += model.score(X_test, y_test) / 100
print('多项式朴素贝叶斯模型平均预测准确率:', score)
'''
多项式朴素贝叶斯模型平均预测准确率: 0.8255263157894736
'''
# English sample sentences used to demonstrate a bag-of-words encoding.
# (Reformatted: the original had all statements collapsed onto one line.)
import jieba
import numpy as np

data = ['My dog ate my homework.','My cat ate the fish.',
        'Precious things are very few in the world,that is the reason there is only one you!']
# Tokenize every sentence and collect all tokens, dropping whitespace and
# punctuation tokens.
# BUG FIX: the original read "if i notin [...]" (missing space) — a syntax error.
result = []
for s in data:
    result.extend([i for i in jieba.lcut(s) if i not in [' ', ',', '.', '!']])
result = np.array(result)
# Deduplicate the token list into a vocabulary, then one-hot encode each
# sentence: each kept token becomes a 0/1 vector over the vocabulary.
# BUG FIX: the original read "if i notin [...]" (missing space) — a syntax error.
result = np.unique(result)
print(result)
for s in data:
    # (token == vocabulary) broadcasts to a boolean row; int8 makes it 0/1.
    word_embedding = [(i == result).astype(np.int8)
                      for i in jieba.lcut(s) if i not in [' ', ',', '.', '!']]
    print(np.array(word_embedding))
# Chinese sample sentences used to demonstrate jieba word segmentation.
# (Reformatted: the original had all statements collapsed onto one line.)
import jieba
import numpy as np

data = ['喜欢上一个人','尼姑亲吻了和尚的嘴唇','老师你教的都是没有的东西']
# Segment every Chinese sentence with jieba and collect all tokens.
# (Reformatted: the original had all statements collapsed onto one line.)
result = []
for s in data:
    result.extend([i for i in jieba.lcut(s)])
result = np.array(result)
# Deduplicate tokens into a vocabulary, then one-hot encode each sentence's
# tokens against it (punctuation filter kept for parity with the English demo).
# BUG FIX: the original read "if i notin [...]" (missing space) — a syntax error.
result = np.unique(result)
print(result)
for s in data:
    word_embedding = [(i == result).astype(np.int8)
                      for i in jieba.lcut(s) if i not in [' ', ',', '.', '!']]
    print(np.array(word_embedding))
# Text-vectorization utilities: CountVectorizer (raw term counts) and
# TfidfTransformer (reweights counts by inverse document frequency).
# (Reformatted: the original had all imports collapsed onto one line.)
import jieba
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Four sample SMS messages (a mix of spam-style and normal text) used as
# inputs for the vectorization demo. One element per line for readability.
X_test = [
    'Your free ringtone is waiting to be collected. Simply text the password "MIX" to 85069 to verify.I see the letter B on my car Please call now 08000930705 for delivery tomorrow',
    'Precious things are very few in the world,that is the reason there is only one you',
    "GENT! We are trying to contact you. Last weekends draw shows that you won a £1000 prize GUARANTEED. U don't know how stubborn I am. Congrats! 1 year special cinema pass for 2 is yours.",
    'Congrats! 1 year special cinema pass for 2 is yours. call 09061209465 now! C Suprman V, Matrix3, StarWars3, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out!',
]
# Load the 20-newsgroups corpus for a larger text-classification experiment.
# (Reformatted: the original had all statements collapsed onto one line;
# comments translated to English — the original misspelled "inverse document
# frequency" as 你文本词频 instead of 逆文本词频.)
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import datasets
# CountVectorizer: raw term frequency counts.
# TF-IDF: term frequency weighted by inverse document frequency.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split

# First call downloads the corpus and caches it to a local file.
news = datasets.fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
9.2、文本数据转换
1 2 3 4 5 6 7
# Convert raw documents to a matrix of TF-IDF features. TfidfVectorizer is
# equivalent to CountVectorizer followed by TfidfTransformer (the original
# pasted that docstring as a bare string literal; kept here as a comment).
tf_idf = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
# NOTE(review): X and y are not defined in this chunk — presumably they should
# be news.data / news.target from the fetch_20newsgroups step; confirm against
# the full article before running.
X_tf_idf = tf_idf.fit_transform(X)
X_tf_idf
# Hold out 20% of the TF-IDF matrix for testing.
X_train, X_test, y_train, y_test = train_test_split(X_tf_idf, y, test_size=0.2)
smoking 表示吸烟,其概率用 P(S) 表示,lung Cancer 表示肺癌,一个人在吸烟的情况下得肺癌的概率用P(C|S) 表示,X-ray 表示需要照医学上的 X 光,肺癌可能会导致需要照 X 光,吸烟也有可能会导致需要照 X 光(所以 smoking 也是 X-ray 的一个因素),所以,因吸烟且得肺癌而需要照X光的概率用 P(X|C,S) 表示。
Bronchitis 表示支气管炎,一个人在吸烟的情况下得支气管炎的概率用 P(B|S),Dyspnoea 表示呼吸困难,支气管炎可能会导致呼吸困难,肺癌也有可能会导致呼吸困难(所以 lung Cancer 也是Dyspnoea的一个因素),因吸烟且得了支气管炎导致呼吸困难的概率用P(D|S,B)表示。
lung Cancer 简记为 C,Bronchitis 简记为 B,Dyspnoea 简记为 D,且 C = 0 表示 lung Cancer 不发生的概率,C = 1表示 lung Cancer 发生的概率,其他含义类似。