# -*- coding: utf-8 -*-
import numpy as np
from collections import defaultdict
from operator import itemgetter

if __name__ == '__main__':
    dataset_filename = "affinity_dataset.txt"
    X = np.loadtxt(dataset_filename)
    n_samples, n_features = X.shape  # number of samples, number of features
    features = ["bread", "milk", "cheese", "apples", "bananas"]  # product names
    # "If X, then Y" is a rule: it consists of a premise and a conclusion.
    # Note that "people who buy A also buy B" and "people who buy B also buy A"
    # are different rules; the loops below treat them separately.
    valid_rules = defaultdict(int)     # times a rule held
    invalid_rules = defaultdict(int)   # times a rule failed
    num_occurences = defaultdict(int)  # how many times each product was bought
    for sample in X:  # for each customer in the dataset
        for premise in range(n_features):
            if sample[premise] == 0:  # this product was not bought; move on to the next one
                continue
            num_occurences[premise] += 1  # record the purchase of this product
            for conclusion in range(n_features):
                if premise == conclusion:  # skip the same product
                    continue
                if sample[conclusion] == 1:
                    valid_rules[(premise, conclusion)] += 1    # the rule held
                else:
                    invalid_rules[(premise, conclusion)] += 1  # the rule failed
    support = valid_rules  # support: the number of times the rule held
    confidence = defaultdict(float)  # confidence dictionary
    for premise, conclusion in valid_rules.keys():  # premise / conclusion
        rule = (premise, conclusion)
        # confidence = times the rule held / times the premise occurred
        confidence[rule] = valid_rules[rule] / num_occurences[premise]

    def print_rule(premise, conclusion, support, confidence, features):
        premise_name = features[premise]
        conclusion_name = features[conclusion]
        print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
        print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
        print(" - Support: {0}".format(support[(premise, conclusion)]))
        print("")

    # Rules with the highest support: items() returns the dictionary's (key, value) pairs,
    # and itemgetter(1) sorts them by the support value in descending order
    sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
    for i in range(5):
        print("Rule #{0}".format(i + 1))
        premise, conclusion = sorted_support[i][0]
        print_rule(premise, conclusion, support, confidence, features)
    sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
    for i in range(5):
        print("Rule #{0}".format(i + 1))
        premise, conclusion = sorted_confidence[i][0]
        print_rule(premise, conclusion, support, confidence, features)
Output:
Rule #1
Rule: If a person buys cheese they will also buy bananas
- Confidence: 0.659
- Support: 27
Rule #2
Rule: If a person buys bananas they will also buy cheese
- Confidence: 0.458
- Support: 27
Rule #3
Rule: If a person buys cheese they will also buy apples
- Confidence: 0.610
- Support: 25
Rule #1
Rule: If a person buys apples they will also buy cheese
- Confidence: 0.694
- Support: 25
Rule #2
Rule: If a person buys cheese they will also buy bananas
- Confidence: 0.659
- Support: 27
Rule #3
Rule: If a person buys bread they will also buy bananas
- Confidence: 0.630
- Support: 17
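As a quick sanity check of the bookkeeping above, here is the same support/confidence computation done by hand on a tiny made-up basket matrix (three shoppers, the same five products); the numbers are illustrative only and unrelated to affinity_dataset.txt.

import numpy as np

toy = np.array([[1, 0, 1, 1, 0],   # bought bread, cheese, apples
                [0, 0, 1, 1, 1],   # bought cheese, apples, bananas
                [1, 0, 1, 0, 1]])  # bought bread, cheese, bananas
# Rule "cheese -> apples": cheese appears in 3 baskets, apples in 2 of those,
# so support = 2 and confidence = 2 / 3 ≈ 0.667
premise, conclusion = 2, 3
support = int(np.sum((toy[:, premise] == 1) & (toy[:, conclusion] == 1)))
confidence = support / int(np.sum(toy[:, premise] == 1))
print(support, round(confidence, 3))  # 2 0.667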
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.datasets import load_iris  # the Iris plant classification dataset
from collections import defaultdict     # dictionaries with default values
from operator import itemgetter         # fetch a given element from a sequence
from sklearn.model_selection import train_test_split  # split a dataset into training and testing sets
from sklearn.metrics import classification_report     # analyse the predictions

# The docstring is kept here for easy reference
def train(X, y_true, feature):
    """
    Computes the predictors and error for a given feature using the OneR algorithm

    Parameters
    ----------
    X: array [n_samples, n_features]
        The two dimensional array that holds the dataset. Each row is a sample, each column
        is a feature.
    y_true: array [n_samples,]
        The one dimensional array that holds the class values. Corresponds to X, such that
        y_true[i] is the class value for sample X[i].
    feature: int
        An integer corresponding to the index of the variable we wish to test.
        0 <= variable < n_features

    Returns
    -------
    predictors: dictionary of tuples: (value, prediction)
        For each item in the array, if the variable has a given value, make the given prediction.
    error: float
        The ratio of training data that this rule incorrectly predicts.
    """
    # Check that the feature index is valid
    n_samples, n_features = X.shape
    assert 0 <= feature < n_features
    # X[:, feature] is numpy indexing: every row, column `feature`;
    # set() removes duplicates, giving the distinct values this feature takes
    values = set(X[:, feature])
    # Stores the predictors array that is returned
    predictors = dict()
    errors = []
    # For each value of this feature, call train_feature_value to get the most
    # frequent class for that value and the error that prediction makes
    for current_value in values:
        most_frequent_class, error = train_feature_value(X, y_true, feature, current_value)
        predictors[current_value] = most_frequent_class  # the most frequent class for this value
        errors.append(error)  # record the error
    total_error = sum(errors)
    # Return the predictor (which class each feature value maps to) and the total error
    return predictors, total_error

def train_feature_value(X, y_true, feature, value):
    class_counts = defaultdict(int)
    # Iterate through each sample and count the frequency of each class/value pair.
    # When the given feature equals `value`, count how often each class occurs;
    # the iris data has three classes, so class_counts ends up with three entries.
    for sample, y in zip(X, y_true):
        if sample[feature] == value:
            class_counts[y] += 1
    # Sort class_counts by count, largest first
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]  # the class that occurs most often
    # The error is the number of samples with this value that belong to any other class
    error = sum([class_count for class_value, class_count in class_counts.items()
                 if class_value != most_frequent_class])
    return most_frequent_class, error  # return the most frequent class and the error

def predict(X_test, model):
    variable = model['variable']    # which feature the One Rule uses for prediction
    predictor = model['predictor']  # dictionary mapping each feature value to a class
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted  # return the predictions

if __name__ == '__main__':
    dataset = load_iris()
    X = dataset.data
    y = dataset.target
    n_samples, n_features = X.shape
    # Compute the mean of each attribute
    attribute_means = X.mean(axis=0)
    assert attribute_means.shape == (n_features,)
    # Discretise the dataset: 1 if the value is at or above the attribute mean, else 0
    X_d = np.array(X >= attribute_means, dtype='int')
    random_state = 14
    X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)  # split into training and testing sets
    print("There are {} training samples".format(y_train.shape))  # number of training samples
    print("There are {} testing samples".format(y_test.shape))    # number of testing samples
    # For each feature, compute the predictor and its error: {feature: ({value: class, ...}, total_error)}
    all_predictors = {variable: train(X_train, y_train, variable) for variable in range(X_train.shape[1])}
    errors = {variable: error for variable, (mapping, error) in all_predictors.items()}  # pull out each predictor's error
    # Find the best (lowest-error) feature to use as the One Rule
    best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
    print("The best model is based on variable {0} and has error {1:.2f}%".format(best_variable, best_error))
    # Choose the best model
    model = {'variable': best_variable, 'predictor': all_predictors[best_variable][0]}
    y_predicted = predict(X_test, model)
    print(classification_report(y_test, y_predicted))  # report on the test results
    print(np.mean(y_predicted == y_test) * 100)        # prediction accuracy
# -*- coding: utf-8 -*-
import numpy as np
import csv
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier   # K-nearest-neighbours classifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score  # cross-validation
# Rescale each feature to the [0, 1] range: the minimum maps to 0 and the maximum to 1
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline  # pipelines

if __name__ == '__main__':
    # The dataset is known to have 351 rows of 35 values each: the first 34 are antenna
    # readings and the last, 'g' or 'b', marks the sample as good or bad
    X = np.zeros((351, 34), dtype='float')
    y = np.zeros((351,), dtype='bool')
    # Open the dataset file in the project root
    with open("ionosphere.data", 'r', encoding='utf-8') as input_file:
        # Create a csv reader object
        reader = csv.reader(input_file)
        # enumerate gives an index for each row
        for i, row in enumerate(reader):
            # Take the first 34 values of the row, convert them to floats and store them in X
            data = [float(datum) for datum in row[:-1]]
            # Set the appropriate row in our dataset
            X[i] = data  # the data
            # 1 if the class is 'g', 0 otherwise
            y[i] = row[-1] == 'g'  # the class
    # Create the training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14)
    print("There are {} samples in the training dataset".format(X_train.shape[0]))
    print("There are {} samples in the testing dataset".format(X_test.shape[0]))
    print("Each sample has {} features".format(X_train.shape[1]))
Output:
There are 263 samples in the training dataset
There are 88 samples in the testing dataset
Each sample has 34 features
Input:
# Create a K-nearest-neighbours classifier; by default it uses the 5 nearest neighbours
estimator = KNeighborsClassifier()
# Fit it on the training data
estimator.fit(X_train, y_train)
# Evaluate the algorithm on the testing set
y_predicted = estimator.predict(X_test)
# Accuracy
accuracy = np.mean(y_test == y_predicted) * 100
print("The accuracy is {0:.1f}%".format(accuracy))
# Use cross-validation to get an average accuracy
scores = cross_val_score(estimator, X, y, scoring='accuracy')
average_accuracy = np.mean(scores) * 100
print("The average accuracy is {0:.1f}%".format(average_accuracy))
Output:
The accuracy is 86.4%
The average accuracy is 82.6%
# Simulate dirty data
X_broken = np.array(X)
X_broken[:, ::2] /= 10
# Compare the prediction accuracy in the two cases
estimator = KNeighborsClassifier()
original_scores = cross_val_score(estimator, X, y, scoring='accuracy')
print("The original average accuracy is {0:.1f}%".format(np.mean(original_scores) * 100))
broken_scores = cross_val_score(estimator, X_broken, y, scoring='accuracy')
print("The broken average accuracy is {0:.1f}%".format(np.mean(broken_scores) * 100))
Output:
The original average accuracy is 82.6%
The broken average accuracy is 73.8%
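Before rescaling the broken features, a minimal illustration of what MinMaxScaler does (a toy two-column array with made-up values, unrelated to the ionosphere data); each column is rescaled independently:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

toy = np.array([[1.0, 200.0],
                [2.0, 400.0],
                [3.0, 600.0]])
# The column minimum becomes 0 and the column maximum becomes 1
print(MinMaxScaler().fit_transform(toy))
# [[0.   0. ]
#  [0.5  0.5]
#  [1.   1. ]]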
Input:
# Rescale the broken data before classification
X_transformed = MinMaxScaler().fit_transform(X_broken)  # fit and transform in one step
estimator = KNeighborsClassifier()
transformed_scores = cross_val_score(estimator, X_transformed, y, scoring='accuracy')
print("The average accuracy is {0:.1f}%".format(np.mean(transformed_scores) * 100))
# Create a pipeline; each step is given as a ('name', step) tuple
scaling_pipeline = Pipeline([('scale', MinMaxScaler()),             # normalise the feature values
                             ('predict', KNeighborsClassifier())])  # predict
# Run the pipeline
scores = cross_val_score(scaling_pipeline, X_broken, y, scoring='accuracy')
print("The pipeline scored an average accuracy of {0:.1f}%".format(np.mean(scores) * 100))
Output:
The pipeline scored an average accuracy of 82.9%
Date Visitor Team VisitorPts ... Score Type OT? Notes
0 2013-10-29 Orlando Magic 87 ... Box Score NaN NaN
1 2013-10-29 Chicago Bulls 95 ... Box Score NaN NaN
2 2013-10-29 Los Angeles Clippers 103 ... Box Score NaN NaN
3 2013-10-30 Brooklyn Nets 94 ... Box Score NaN NaN
4 2013-10-30 Boston Celtics 87 ... Box Score NaN NaN
5 2013-10-30 Miami Heat 110 ... Box Score NaN NaN
[6 rows x 8 columns]
# Extract a new feature: did the home team win this game?
results["HomeWin"] = results["VisitorPts"] < results["HomePts"]
y_true = results["HomeWin"].values  # win/loss outcomes
# Two new features, initialised to False, recording whether each team won its previous game
results["HomeLastWin"] = False
results["VisitorLastWin"] = False
won_last = defaultdict(int)
for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # Store in results whether each team won the game it played before this one
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    results.iloc[index] = row
    # Update won_last with the result of this game
    won_last[home_team] = row["HomeWin"]
    won_last[visitor_team] = not row["HomeWin"]
X_previouswins = results[["HomeLastWin", "VisitorLastWin"]].values
# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=14)
# Cross-validate
scores = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
print("Using just the last result from the home and visitor teams")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
Output:
Using just the last result from the home and visitor teams
Accuracy: 56.4%
# Create a transformer instance
encoding = LabelEncoder()
# Fit it on the team names so that each name maps to an integer
encoding.fit(results["Home Team"].values)
# Encode the home and visiting team names for every game and stack them into a matrix
home_teams = encoding.transform(results["Home Team"].values)
visitor_teams = encoding.transform(results["Visitor Team"].values)
# Build the dataset: [[home team id, visitor team id], [home team id, visitor team id], ...]
X_teams = np.vstack([home_teams, visitor_teams]).T
# Create the one-hot transformer
onehot = OneHotEncoder()
# Generate the transformed features
X_teams = onehot.fit_transform(X_teams).todense()
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
clf = RandomForestClassifier(random_state=14, n_jobs=-1)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Using full team labels is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
Output:
Accuracy: 60.5%
Using full team labels is ranked higher
Accuracy: 61.4%
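A minimal sketch (with made-up team names, unrelated to the NBA data) of what the two encoders above do: LabelEncoder turns names into integers, and OneHotEncoder turns each integer column into one binary column per observed value.

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

teams = np.array(["Bulls", "Heat", "Bulls", "Celtics"])
encoding = LabelEncoder().fit(teams)
print(encoding.transform(teams))                        # [0 2 0 1]: labels sorted alphabetically
pairs = np.array([[0, 2], [1, 0]])                      # two games: (home team id, visitor team id)
print(OneHotEncoder().fit_transform(pairs).todense())   # one binary column per (column, team id) pair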
Combine the features generated above to build a new model.
np.hstack() is used here to concatenate the two feature matrices horizontally (column-wise); see the small sketch below.
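A minimal illustration of how np.hstack joins feature matrices column-wise (toy arrays, not the NBA features):

import numpy as np

a = np.array([[1, 0], [0, 1], [1, 1]])   # two binary features for three samples
b = np.array([[5], [7], [9]])            # one extra feature
combined = np.hstack([a, b])             # shape (3, 3): the columns of a followed by the column of b
print(combined)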
Input:
X_all = np.hstack([X_home_higher, X_teams])  # combine the features computed above
print(X_all.shape)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
Output:
(1319, 62)
Using whether the home team is ranked higher
Accuracy: 61.6%
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from ch3.nba_test import X_all
from sklearn.model_selection import GridSearchCV  # grid search for the best parameters

if __name__ == '__main__':
    """
    Features to build:
    - How long has it been since each team's last game?
    - How did the two teams do in their last five games?
    - Does a team play better against certain opponents?
    """
    dataset = pd.read_csv("NBA_data.csv", parse_dates=["Date"], skiprows=[0, ],
                          usecols=[0, 2, 3, 4, 5, 6, 7, 9])  # load the dataset
    dataset.columns = ["Date", "Visitor Team", "VisitorPts", "Home Team", "HomePts",
                       "Score Type", "OT?", "Notes"]
    dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]
    y_true = dataset["HomeWin"].values  # win/loss outcomes
    # The date each team last played
    last_played_date = defaultdict(datetime.date)
    # Initialise it manually for every team
    for team in set(dataset["Home Team"]):
        last_played_date[team] = datetime.date(year=2013, month=10, day=25)
    # Each team's past results, stored as a list like [True, False, ...]
    last_five_games = defaultdict(list)
    # Number of wins in the home and visiting teams' last five games
    dataset["HWinTimes"] = 0
    dataset["VWinTimes"] = 0
    # Days since each team's previous game
    dataset["HLastPlayedSpan"] = 0
    dataset["VLastPlayedSpan"] = 0
    for index, row in dataset.iterrows():
        home_team = row["Home Team"]
        visitor_team = row["Visitor Team"]
        row["HWinTimes"] = sum(last_five_games[home_team][-5:])
        row["VWinTimes"] = sum(last_five_games[visitor_team][-5:])
        row["HLastPlayedSpan"] = (row["Date"].date() - last_played_date[home_team]).days
        row["VLastPlayedSpan"] = (row["Date"].date() - last_played_date[visitor_team]).days
        dataset.iloc[index] = row
        last_played_date[home_team] = row["Date"].date()
        last_played_date[visitor_team] = row["Date"].date()
        last_five_games[home_team].append(row["HomeWin"])
        last_five_games[visitor_team].append(not row["HomeWin"])
    X_1 = dataset[["HLastPlayedSpan", "VLastPlayedSpan", "HWinTimes", "VWinTimes"]].values
    clf = DecisionTreeClassifier(random_state=14)
    scores = cross_val_score(clf, X_1, y_true, scoring='accuracy')
    print("DecisionTree: Using time span and win times")
    print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
    clf = RandomForestClassifier(random_state=14, n_jobs=-1)
    scores = cross_val_score(clf, X_1, y_true, scoring='accuracy')
    print("RandomForest: Using time span and win times")
    print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
    print("---------------------------------")
    X_all = np.hstack([X_1, X_all])
    clf = DecisionTreeClassifier(random_state=14)
    scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
    print("DecisionTree: Using time span and win times")
    print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
    clf = RandomForestClassifier(random_state=14, n_jobs=-1)
    scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
    print("RandomForest: Using time span and win times")
    print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
    print("---------------------------------")
    parameter_space = {
        "max_features": [2, 10, 'auto'],
        "n_estimators": [100, ],
        "criterion": ["gini", "entropy"],
        "min_samples_leaf": [2, 4, 6],
    }
    grid = GridSearchCV(clf, parameter_space)
    grid.fit(X_all, y_true)
    print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))
    print(grid.best_estimator_)
Output:
DecisionTree: Using time span and win times
Accuracy: 56.4%
RandomForest: Using time span and win times
Accuracy: 58.3%
---------------------------------
DecisionTree: Using time span and win times
Accuracy: 57.2%
RandomForest: Using time span and win times
Accuracy: 61.0%
---------------------------------
Accuracy: 64.6%
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='entropy', max_depth=None, max_features=2,
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=4, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=-1, oob_score=False, random_state=14, verbose=0,
warm_start=False)
# Dictionary holding the frequent itemsets found so far
frequent_itemsets = {}
min_support = 50
# Step one: for each movie, create an itemset containing only that movie.
# frozenset() returns a frozen (immutable) set: unlike an ordinary set it cannot be
# added to or have elements removed. Ordinary sets are mutable, and a set cannot hold
# mutable elements, so an ordinary set cannot be placed inside another set; a frozenset
# is immutable and therefore can be.
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                            for movie_id, row in num_favorable_by_movie.iterrows()
                            if row["Favorable"] > min_support)
# Note: double counting can occur, e.g. the users who like movies 1 and 50 number 50 and 100
# respectively, yet the itemset {1, 50} gets a count of 100. There are two reasons; the first
# is that the itemset ordering can swap when current_superset is built.

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    # Iterate over each user and the movies they liked
    for user, reviews in favorable_reviews_by_users.items():
        # Iterate over each (k-1)-itemset
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):  # is the itemset a subset of the movies this user liked?
                # Try every other movie this user liked that is not already in the itemset
                for other_reviewed_movie in reviews - itemset:
                    # Add that movie to the itemset
                    current_superset = itemset | frozenset({other_reviewed_movie})
                    counts[current_superset] += 1  # one more count of support for this itemset
    # Return the itemsets with one more element that meet the minimum support
    res = dict([(itemset, frequency) for itemset, frequency in counts.items()
                if frequency >= min_support])
    return res

for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users,
                                                   frequent_itemsets[k - 1], min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        # Flush the buffer so the message appears immediately; use sparingly,
        # as the extra output work slows the program down
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        sys.stdout.flush()
# Remove the initial itemsets, which contain only one element
del frequent_itemsets[1]
Output:
I found 93 frequent itemsets of length 2
I found 295 frequent itemsets of length 3
I found 593 frequent itemsets of length 4
I found 785 frequent itemsets of length 5
I found 677 frequent itemsets of length 6
I found 373 frequent itemsets of length 7
I found 126 frequent itemsets of length 8
I found 24 frequent itemsets of length 9
I found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11
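A minimal illustration (toy movie IDs, unrelated to the MovieLens data) of the frozenset operations the loop above relies on:

reviews = frozenset({1, 5, 7})       # movies one user liked
itemset = frozenset({1, 5})          # a candidate (k-1)-itemset
print(itemset.issubset(reviews))     # True: the user liked every movie in the itemset
print(reviews - itemset)             # frozenset({7}): movies that could extend the itemset
print(itemset | frozenset({7}))      # frozenset({1, 5, 7}): the k-itemset whose count is incremented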
# Compute the confidence of each rule
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
# Iterate over each user and the movies they liked
for user, reviews in favorable_reviews_by_users.items():
    # Iterate over each candidate rule
    for candidate_rule in candidate_rules:
        # Split the rule into its premise and conclusion
        premise, conclusion = candidate_rule
        # If the premise is a subset of the movies the user liked (the premise holds)
        if premise.issubset(reviews):
            # ...and the user also liked the conclusion movie
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
# Confidence: how often the conclusion holds divided by how often the premise holds
rule_confidence = {candidate_rule: correct_counts[candidate_rule]
                   / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in candidate_rules}
# Sort the rules by confidence
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):
    print("Rule #{}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: If a person recommends {} they will also recommend {}".format(premise, conclusion))
    print("- Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("--------------------")
Output:
Rule #1
Rule: If a person recommends frozenset({98, 181}) they will also recommend 50
- Confidence: 1.000
--------------------
Rule #2
Rule: If a person recommends frozenset({172, 79}) they will also recommend 174
- Confidence: 1.000
--------------------
Rule #3
Rule: If a person recommends frozenset({258, 172}) they will also recommend 174
- Confidence: 1.000
--------------------
Rule #4
Rule: If a person recommends frozenset({1, 181, 7}) they will also recommend 50
- Confidence: 1.000
--------------------
Rule #5
Rule: If a person recommends frozenset({1, 172, 7}) they will also recommend 174
- Confidence: 1.000
--------------------
movie_name_data = pd.read_csv("ml-100k/u.item", delimiter='|', header=None, encoding="mac-roman")
movie_name_data.columns = ['MovieID', 'Title', 'Release Date', 'Video Release', 'IMDB', '<UNK>',
                           'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
                           'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
                           'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
for index in range(5):
    print('Rule #{0}'.format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ', '.join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print('Rule: if a person recommends {0} they will also recommend {1}'.format(premise_names, conclusion_name))
    print(' - Confidence: {0:.3f}'.format(rule_confidence[(premise, conclusion)]))
    print("--------------------")
Output:
Rule #1
Rule: if a person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983) they will also recommend Star Wars (1977)
- Confidence: 1.000
--------------------
Rule #2
Rule: if a person recommends Empire Strikes Back, The (1980), Fugitive, The (1993) they will also recommend Raiders of the Lost Ark (1981)
- Confidence: 1.000
--------------------
Rule #3
Rule: if a person recommends Contact (1997), Empire Strikes Back, The (1980) they will also recommend Raiders of the Lost Ark (1981)
- Confidence: 1.000
--------------------
Rule #4
Rule: if a person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
- Confidence: 1.000
--------------------
Rule #5
Rule: if a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)
- Confidence: 1.000
--------------------
# Evaluate against the test set
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
test_favorable_by_users = dict((k, frozenset(v.values))
                               for k, v in test_favorable.groupby("UserID")["MovieID"])
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
test_confidence = {candidate_rule: correct_counts[candidate_rule]
                   / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in rule_confidence}
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print('Rule: if a person recommends {0} they will also recommend {1}'.format(premise_names, conclusion_name))
    print(' - Confidence: {0:.3f}'.format(rule_confidence[(premise, conclusion)]))
    print("--------------------")
Output:
Rule #1
Rule: if a person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983) they will also recommend Star Wars (1977)
- Confidence: 1.000
--------------------
Rule #2
Rule: if a person recommends Empire Strikes Back, The (1980), Fugitive, The (1993) they will also recommend Raiders of the Lost Ark (1981)
- Confidence: 1.000
--------------------
Rule #3
Rule: if a person recommends Contact (1997), Empire Strikes Back, The (1980) they will also recommend Raiders of the Lost Ark (1981)
- Confidence: 1.000
--------------------
Rule #4
Rule: if a person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
- Confidence: 1.000
--------------------
Rule #5
Rule: if a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)
- Confidence: 1.000
--------------------
# -*- coding: utf-8 -*-
from collections import Counter

if __name__ == '__main__':
    s = """Three Rings for the Elven-kings under the sky,
Seven for the Dwarf-lords in halls of stone,
Nine for Mortal Men, doomed to die,
One for the Dark Lord on his dark throne
In the Land of Mordor where the Shadows lie.
One Ring to rule them all, One Ring to find them,
One Ring to bring them all and in the darkness bind them.
In the Land of Mordor where the Shadows lie""".lower()
    words = s.split()
    c = Counter(words)
    # Print the five most common words
    print(c.most_common(5))
# Edit distance between two words
steps = edit_distance("STEP", "STOP")
print("The num of steps needed is: {}".format(steps))

# Distance: the word length (4) minus the number of letters that match at the same position;
# the smaller the value, the more similar the two words are
def compute_distance(prediction, word):
    return len(prediction) - sum(prediction[i] == word[i] for i in range(len(prediction)))

# Improved prediction function
def improved_prediction(word, net, dictionary, shear=0.2):
    captcha = create_captcha(word, shear=shear)
    prediction = predict_captcha(captcha, net)
    prediction = prediction[:4]
    # If the predicted word is not in the dictionary, take the dictionary word with the smallest distance
    if prediction not in dictionary:
        distance = sorted([(w, compute_distance(prediction, w)) for w in dictionary],
                          key=itemgetter(1))
        best_word = distance[0]
        prediction = best_word[0]
    return word == prediction, word, prediction

num_correct = 0
num_incorrect = 0
for word in valid_words:
    correct, word, prediction = improved_prediction(word, net, valid_words, shear=0.2)
    if correct:
        num_correct += 1
    else:
        num_incorrect += 1
print("Number correct is {}".format(num_correct))
print("Number incorrect is {}".format(num_incorrect))
Output:
The num of steps needed is: 1
Number correct is 3785
Number incorrect is 1728
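A quick sanity check of compute_distance, repeating the definition from the snippet above so it runs on its own: "STEP" and "STOP" differ at exactly one aligned position, so the distance is 1.

def compute_distance(prediction, word):
    # word length minus the number of positions where the letters agree
    return len(prediction) - sum(prediction[i] == word[i] for i in range(len(prediction)))

print(compute_distance("STEP", "STOP"))  # 1: only the third letter differs
print(compute_distance("STEP", "PETS"))  # 4: no letters match position-for-position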
# -*- coding: utf-8 -*-
# get_data.py
import requests
import os
import time
from collections import defaultdict

titles = {
    'burton': [4657, 2400, 5760, 6036, 7111, 8821, 18506, 4658, 5761, 6886, 7113],
    'dickens': [24022, 1392, 1414, 1467, 2324, 580, 786, 888, 963, 27924, 1394, 1415, 15618,
                25985, 588, 807, 914, 967, 30127, 1400, 1421, 16023, 28198, 644, 809, 917, 968,
                1023, 1406, 1422, 17879, 30368, 675, 810, 924, 98, 1289, 1413, 1423, 17880,
                32241, 699, 821, 927],
    'doyle': [2349, 11656, 1644, 22357, 2347, 290, 34627, 5148, 8394, 26153, 12555, 1661, 23059,
              2348, 294, 355, 5260, 8727, 10446, 126, 17398, 2343, 2350, 3070, 356, 5317, 903,
              10581, 13152, 2038, 2344, 244, 32536, 423, 537, 108, 139, 2097, 2345, 24951, 32777,
              4295, 7964, 11413, 1638, 21768, 2346, 2845, 3289, 439, 834],
    'gaboriau': [1748, 1651, 2736, 3336, 4604, 4002, 2451, 305, 3802, 547],
    'nesbit': [34219, 23661, 28804, 4378, 778, 20404, 28725, 33028, 4513, 794],
    'tarkington': [1098, 15855, 1983, 297, 402, 5798, 8740, 980, 1158, 1611, 2326, 30092, 483,
                   5949, 8867, 13275, 18259, 2595, 3428, 5756, 6401, 9659],
    'twain': [1044, 1213, 245, 30092, 3176, 3179, 3183, 3189, 74, 86, 1086, 142, 2572, 3173,
              3177, 3180, 3186, 3192, 76, 91, 119, 1837, 2895, 3174, 3178, 3181, 3187, 3432, 8525],
}
assert len(titles) == 7
assert len(titles['tarkington']) == 22
assert len(titles['dickens']) == 44
assert len(titles['nesbit']) == 10
assert len(titles['doyle']) == 51
assert len(titles['twain']) == 29
assert len(titles['burton']) == 11
assert len(titles['gaboriau']) == 10

url_base = 'http://www.gutenberg.org/files/'
url_format = '{url_base}{id}/{id}-0.txt'
# Fallback URL for books that are missing at the primary location
url_fix_format = 'http://www.gutenberg.org/cache/epub/{id}/pg{id}.txt'
fixes = defaultdict(list)
# fixes = {}
# fixes[4657] = 'http://www.gutenberg.org/cache/epub/4657/pg4657.txt'

# make parent folder if not exists
# data_folder = os.path.join(os.path.expanduser('~'), 'Data', 'books')  # store under the user's home directory
data_folder = os.path.dirname(os.path.abspath(__file__))

if __name__ == '__main__':
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    print(data_folder)
    for author in titles:
        print('Downloading titles from', author)
        # make author's folder if not exists
        author_folder = os.path.join(data_folder, author)
        if not os.path.exists(author_folder):
            os.makedirs(author_folder)
        # download each title to this folder
        for bookid in titles[author]:
            # if bookid in fixes:
            #     print(' - Applying fix to book with id', bookid)
            #     url = fixes[bookid]
            # else:
            #     print(' - Getting book with id', bookid)
            #     url = url_format.format(url_base=url_base, id=bookid)
            url = url_format.format(url_base=url_base, id=bookid)
            print(' - ', url)
            filename = os.path.join(author_folder, '%s.txt' % bookid)
            if os.path.exists(filename):
                print(' - File already exists, skipping')
            else:
                r = requests.get(url)
                if r.status_code == 404:
                    print('url 404:', author, bookid, 'add to fixes list')
                    fixes[author].append(bookid)
                else:
                    txt = r.text
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write(txt)
                time.sleep(1)
    print('Download complete')

    print('Downloading the books on the fixes list')
    for author in fixes:
        print('Downloading the works of <%s>' % author)
        author_folder = os.path.join(data_folder, author)
        if not os.path.exists(author_folder):
            os.makedirs(author_folder)
        for bookid in fixes[author]:
            filename = os.path.join(author_folder, '%s.txt' % bookid)
            if os.path.exists(filename):
                print('File already downloaded, skipping')
            else:
                url_fix = url_fix_format.format(id=bookid)
                print(' - ', url_fix)
                r = requests.get(url_fix)
                if r.status_code == 404:
                    print('Failed again!', author, bookid)
                else:
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write(r.text)
                time.sleep(1)
    print('Fixes list downloaded')
# -*- coding: utf-8 -*-
# author_test.py
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC  # support vector machine
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from ch9 import getdata

# Strip the Project Gutenberg header and footer
def clean_book(document):
    lines = document.split("\n")
    start = 0
    end = len(lines)
    for i in range(len(lines)):
        line = lines[i]
        if line.startswith("*** START OF THIS PROJECT GUTENBERG"):
            start = i + 1
        elif line.startswith("*** END OF THIS PROJECT GUTENBERG"):
            end = i - 1
    return "\n".join(lines[start:end])

def load_books_data(folder=getdata.data_folder):
    # Store the documents and their authors
    documents = []
    authors = []
    # Walk through the author subfolders
    subfolders = [subfolder for subfolder in os.listdir(folder)
                  if os.path.isdir(os.path.join(folder, subfolder))]
    for author_number, subfolder in enumerate(subfolders):
        full_subfolder_path = os.path.join(folder, subfolder)
        for document_name in os.listdir(full_subfolder_path):
            # Skip the compiled getdata module that lives in the same directory
            if document_name == 'getdata.cpython-38.pyc':
                continue
            with open(os.path.join(full_subfolder_path, document_name), 'r') as inf:
                documents.append(clean_book(inf.read()))
                authors.append(author_number)
    return documents, np.array(authors, 'int')

# Function words
function_words = ["a", "able", "aboard", "about", "above", "absent", "according", "accordingly",
                  "across", "after", "against", "ahead", "albeit", "all", "along", "alongside",
                  "although", "am", "amid", "amidst", "among", "amongst", "amount", "an", "and",
                  "another", "anti", "any", "anybody", "anyone", "anything", "are", "around", "as",
                  "aside", "astraddle", "astride", "at", "away", "bar", "barring", "be", "because",
                  "been", "before", "behind", "being", "below", "beneath", "beside", "besides",
                  "better", "between", "beyond", "bit", "both", "but", "by", "can", "certain",
                  "circa", "close", "concerning", "consequently", "considering", "could", "couple",
                  "dare", "deal", "despite", "down", "due", "during", "each", "eight", "eighth",
                  "either", "enough", "every", "everybody", "everyone", "everything", "except",
                  "excepting", "excluding", "failing", "few", "fewer", "fifth", "first", "five",
                  "following", "for", "four", "fourth", "from", "front", "given", "good", "great",
                  "had", "half", "have", "he", "heaps", "hence", "her", "hers", "herself", "him",
                  "himself", "his", "however", "i", "if", "in", "including", "inside", "instead",
                  "into", "is", "it", "its", "itself", "keeping", "lack", "less", "like", "little",
                  "loads", "lots", "majority", "many", "masses", "may", "me", "might", "mine",
                  "minority", "minus", "more", "most", "much", "must", "my", "myself", "near",
                  "need", "neither", "nevertheless", "next", "nine", "ninth", "no", "nobody",
                  "none", "nor", "nothing", "notwithstanding", "number", "numbers", "of", "off",
                  "on", "once", "one", "onto", "opposite", "or", "other", "ought", "our", "ours",
                  "ourselves", "out", "outside", "over", "part", "past", "pending", "per",
                  "pertaining", "place", "plenty", "plethora", "plus", "quantities", "quantity",
                  "quarter", "regarding", "remainder", "respecting", "rest", "round", "save",
                  "saving", "second", "seven", "seventh", "several", "shall", "she", "should",
                  "similar", "since", "six", "sixth", "so", "some", "somebody", "someone",
                  "something", "spite", "such", "ten", "tenth", "than", "thanks", "that", "the",
                  "their", "theirs", "them", "themselves", "then", "thence", "therefore", "these",
                  "they", "third", "this", "those", "though", "three", "through", "throughout",
                  "thru", "thus", "till", "time", "to", "tons", "top", "toward", "towards", "two",
                  "under", "underneath", "unless", "unlike", "until", "unto", "up", "upon", "us",
                  "used", "various", "versus", "via", "view", "wanting", "was", "we", "were",
                  "what", "whatever", "when", "whenever", "where", "whereas", "wherever",
                  "whether", "which", "whichever", "while", "whilst", "who", "whoever", "whole",
                  "whom", "whomever", "whose", "will", "with", "within", "without", "would",
                  "yet", "you", "your", "yours", "yourself", "yourselves"]

if __name__ == '__main__':
    # Load the data
    documents, classes = load_books_data(getdata.data_folder)
    # Count only the function words as features
    extractor = CountVectorizer(vocabulary=function_words)
    # Parameter grid
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    svr = SVC()
    # Grid search for the best parameter values
    grid = GridSearchCV(svr, parameters)
    # Classify using the function words
    pipeline1 = Pipeline([('feature_extraction', extractor),
                          ('clf', grid)])
    scores = cross_val_score(pipeline1, documents, classes, scoring='f1_macro')
    print(np.mean(scores))
Output:
0.7738985477640941
Score: 0.813
N-grams
An N-gram is a sequence of N consecutive objects taken as a group, where N is the number of objects in each group.
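A minimal illustration (a made-up sentence, not the book data) of the character 3-grams that the pipeline below extracts with CountVectorizer(analyzer='char', ngram_range=(3, 3)):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 3))
vectorizer.fit_transform(["the cat sat"])
# Each feature is a run of 3 consecutive characters, spaces included
print(sorted(vectorizer.vocabulary_))
# [' ca', ' sa', 'at ', 'cat', 'e c', 'he ', 'sat', 't s', 'the']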
Input:
# Classify using character N-grams
pipeline = Pipeline([('feature_extraction', CountVectorizer(analyzer='char', ngram_range=(3, 3))),  # character 3-grams
                     ('classifier', grid)])
scores = cross_val_score(pipeline, documents, classes, scoring='f1_macro')
print("Score: {:.3f}".format(np.mean(scores)))
random_state = check_random_state(14)
letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
assert len(letters) == 26
shear_values = np.arange(0, 0.8, 0.05)
scale_values = np.arange(0.9, 1.1, 0.1)

# Generate an image of a single random letter
def generate_sample(random_state=None):
    random_state = check_random_state(random_state)
    letter = random_state.choice(letters)
    shear = random_state.choice(shear_values)
    scale = random_state.choice(scale_values)
    return (create_captcha(letter, shear=shear, size=(30, 30), scale=scale),
            letters.index(letter))

image, target = generate_sample(random_state)
plt.imshow(image, cmap="Greys")
print("The target for this image is: {0}".format(letters[target]))
[('From:', 11536), ('lerxst@wam.umd.edu', 2), ("(where's", 3), ('my', 7679), ('thing)', 9)]
280308
---------- 3.5958340090001