|
| 1 | +<!DOCTYPE html> |
| 2 | +<html lang="en"> |
| 3 | +<head> |
| 4 | +<meta charset="UTF-8"> |
| 5 | +<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| 6 | +<title>stats - Statistics, linear modelling</title> |
| 7 | +</head> |
| 8 | +<body> |
| 9 | +<h1>stats - Statistics, linear modelling</h1> |
| 10 | + |
| 11 | +This demo illustrates the <a href="https://metacpan.org/pod/PDL::Stats">PDL::Stats</a> module, |
| 12 | +which lets you analyse statistical data in a number of ways. |
| 13 | +<pre> |
| 14 | +use <a href="https://metacpan.org/pod/PDL::Stats">PDL::Stats</a>; |
| 15 | +$w = pgswin(); # <a href="https://metacpan.org/pod/PDL::Graphics::Simple">PDL::Graphics::Simple</a> window |
| 16 | +srandom(5); # for reproducibility |
| 17 | +</pre> |
| 18 | + |
| 19 | +<hr/> |
| 20 | +First, <a href="https://metacpan.org/pod/PDL::Stats::TS">PDL::Stats::TS</a> - let's show three sets of random data, against |
| 21 | +the de-seasonalised version |
| 22 | +<pre> |
| 23 | +$data = random(12, 3); |
| 24 | +$data->plot_dseason( 12, { win=>$w } ); |
| 25 | +</pre> |
| 26 | +<h4>Output</h4> |
| 27 | + |
| 28 | +<img src="images/demos/stats/output-1.png"/> |
| 29 | + |
| 30 | +<hr/> |
| 31 | +Now let's show the seasonal means of that data |
| 32 | +<pre> |
| 33 | +($m, $ms) = $data->season_m( 6, { plot=>1, win=>$w } ); |
| 34 | +print "m=$m\nms=$ms"; |
| 35 | +</pre> |
| 36 | +<h4>Output</h4> |
| 37 | +<pre> |
| 38 | +m= |
| 39 | +[ |
| 40 | + [ 0.63660469 0.21671189 0.31362712 0.65773141 0.44697942 0.44024627] |
| 41 | + [ 0.77400156 0.24928771 0.54476702 0.42999148 0.80334015 0.63420928] |
| 42 | + [ 0.89574201 0.28036099 0.43392446 0.31499982 0.16980532 0.29458511] |
| 43 | +] |
| 44 | + |
| 45 | +ms= |
| 46 | +[ |
| 47 | + [ 0.022649418 0.027603922 0.065493262 5.6366038e-06 0.0097363653 0.053513471] |
| 48 | + [ 0.026704068 0.026384061 0.18308074 0.14584601 0.012937776 0.081424333] |
| 49 | + [ 0.0031311796 0.011825565 0.012130112 0.043369421 0.00970367 7.9067403e-05] |
| 50 | +] |
| 51 | +</pre> |
| 52 | +<img src="images/demos/stats/output-2.png"/> |
| 53 | + |
| 54 | +<hr/> |
| 55 | +Now, auto-correlation of a random sound-sample. |
| 56 | +See <a href="https://pdl.perl.org/advent/blog/2024/12/15/pitch-detection/">https://pdl.perl.org/advent/blog/2024/12/15/pitch-detection/</a> for more! |
| 57 | +<pre> |
| 58 | +random(100)->plot_acf( 50, { win=>$w } ); |
| 59 | +</pre> |
| 60 | +<h4>Output</h4> |
| 61 | + |
| 62 | +<img src="images/demos/stats/output-3.png"/> |
| 63 | + |
| 64 | +<hr/> |
| 65 | +<a href="https://metacpan.org/pod/PDL::Stats::Kmeans">PDL::Stats::Kmeans</a> clusters data points into "k" (a supplied number) groups |
| 66 | +<pre> |
| 67 | +$data = grandom(200, 2); # two rows = two dimensions |
| 68 | +%k = $data->kmeans; # use default of 3 clusters |
| 69 | +print "$_\t@{[$k{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %k; |
| 70 | +$w->plot( |
| 71 | + (map +(with=>'points', style=>$_+1, ke=>"Cluster ".($_+1), |
| 72 | + $data->dice_axis(0,which($k{cluster}->slice(",$_")))->dog), |
| 73 | + 0 .. $k{cluster}->dim(1)-1), |
| 74 | + (map +(with=>'circles', style=>$_+1, ke=>"Centroid ".($_+1), $k{centroid}->slice($_)->dog, 0.1), |
| 75 | + 0 .. $k{centroid}->dim(0)-1), |
| 76 | + {le=>'tr'}, |
| 77 | +); |
| 78 | +</pre> |
| 79 | +<h4>Output</h4> |
| 80 | +<pre> |
| 81 | +R2 0.562033919028588 |
| 82 | +centroid [ |
| 83 | + [ -1.1856909 0.045429685 0.76406539] |
| 84 | + [ 0.54058456 -1.07968 0.4569263] |
| 85 | +] |
| 86 | +cluster [ |
| 87 | + [0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0] |
| 88 | + [1 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 1 1 0 0 1 1 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 1 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1] |
| 89 | + [0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 0 1 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 1 0 0 0] |
| 90 | +] |
| 91 | +n [59 71 70] |
| 92 | +ss [ |
| 93 | + [ 26.674205 40.003228 26.652344] |
| 94 | + [ 36.077374 23.989678 31.197273] |
| 95 | +] |
| 96 | +</pre> |
| 97 | +<img src="images/demos/stats/output-4.png"/> |
| 98 | + |
| 99 | +<hr/> |
| 100 | +There's also a principal component analysis (PCA) clustering function |
| 101 | +<pre> |
| 102 | +$data = qsort random 10, 5; # 10 obs on 5 variables |
| 103 | +%r = $data->pca( { plot=>1, win=>$w } ); |
| 104 | +</pre> |
| 105 | +Here we can see that very nearly all the variance is in the first component. |
| 106 | +<h4>Output</h4> |
| 107 | + |
| 108 | +<img src="images/demos/stats/output-5.png"/> |
| 109 | + |
| 110 | +<hr/> |
| 111 | +From that PCA we can plot the original vs PCA-transformed scores |
| 112 | +along the first two components |
| 113 | +<pre> |
| 114 | +$data->plot_scores( $r{eigenvector}, {win=>$w} ); |
| 115 | +</pre> |
| 116 | +<h4>Output</h4> |
| 117 | + |
| 118 | +<img src="images/demos/stats/output-6.png"/> |
| 119 | + |
| 120 | +<hr/> |
| 121 | +Suppose this is a person's ratings for top 10 box office movies |
| 122 | +ascending sorted by box office |
| 123 | +<pre> |
| 124 | +$y = pdl '[1 1 2 2 2 2 4 4 5 5]'; |
| 125 | +$x = cat sequence(10), sequence(10)**2; # IV with linear and quadratic component |
| 126 | +</pre> |
| 127 | +We do an ordinary least squares (OLS), or multiple linear regression, |
| 128 | +to get the underlying linear model. Here we also plot how far the real |
| 129 | +data was from our calculated model. |
| 130 | +<pre> |
| 131 | +%m = $y->ols( $x, { plot=>1, win=>$w } ); |
| 132 | +print "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m; |
| 133 | +</pre> |
| 134 | +<h4>Output</h4> |
| 135 | +<pre> |
| 136 | +F 40.4225352112676 |
| 137 | +F_df [2 7] |
| 138 | +F_p 0.000142834216344756 |
| 139 | +R2 0.920314253647587 |
| 140 | +b [0.981818181818215 0.212121212121208 0.0303030303030304] |
| 141 | +b_p [0.0399105086044558 0.328001176835915 0.203034040562643] |
| 142 | +b_se [0.389875811466657 0.201746929168937 0.021579988884181] |
| 143 | +b_t [2.51828442017153 1.05142225953578 1.40421899499882] |
| 144 | +ss_model 19.8787878787879 |
| 145 | +ss_residual 1.72121212121212 |
| 146 | +ss_total 21.6 |
| 147 | +y_pred [0.981818181818215 1.22424242424245 1.52727272727275 1.89090909090911 2.31515151515153 2.80000000000002 3.34545454545456 3.95151515151516 4.61818181818182 5.34545454545455] |
| 148 | +</pre> |
| 149 | +<img src="images/demos/stats/output-7.png"/> |
| 150 | + |
| 151 | +<hr/> |
| 152 | +<pre> |
| 153 | +$y = pdl '[1 1 2 2 3 3 3 3 4 5 5 5]'; # suppose this is ratings for 12 apples |
| 154 | +$a = pdl '[1 2 3 1 2 3 1 2 3 1 2 3]'; # IV for types of apple |
| 155 | +@b = qw( y y y y y y n n n n n n ); # IV for whether we baked the apple |
| 156 | +</pre> |
| 157 | +First let's look at the raw data, categorised in each independent variable: |
| 158 | +<pre> |
| 159 | +$y->plot_stripchart( $a, \@b, { IVNM=>[qw(apple bake)], win=>$w } ); |
| 160 | +</pre> |
| 161 | +Looks like there's a visible partition in the "bake" IV |
| 162 | +<h4>Output</h4> |
| 163 | + |
| 164 | +<img src="images/demos/stats/output-8.png"/> |
| 165 | + |
| 166 | +<hr/> |
| 167 | +Let's try the analysis of variance (ANOVA) in <a href="https://metacpan.org/pod/PDL::Stats::GLM">PDL::Stats::GLM</a> |
| 168 | +<pre> |
| 169 | +%m = $y->anova( $a, \@b, { IVNM=>[qw(apple bake)], plot=>0, win=>$w } ); |
| 170 | +print "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m; |
| 171 | +</pre> |
| 172 | +The p-value of variance explained by "bake" is ~0.015 - significant |
| 173 | +Let's plot the means of the interaction of all IVs |
| 174 | +<pre> |
| 175 | +$m{'| apple ~ bake | m'}->plot_means($m{'| apple ~ bake | se'}, |
| 176 | + { IVNM=>[qw(apple bake)], plot=>1, win=>$w }); |
| 177 | +</pre> |
| 178 | +<h4>Output</h4> |
| 179 | +<pre> |
| 180 | +F 2.46666666666667 |
| 181 | +F_df [5 6] |
| 182 | +F_p 0.151168719948632 |
| 183 | +ms_model 3.08333333333333 |
| 184 | +ms_residual 1.25 |
| 185 | +ss_model 15.4166666666667 |
| 186 | +ss_residual 7.5 |
| 187 | +ss_total 22.9166666666667 |
| 188 | +| apple | F 0.466666666666667 |
| 189 | +| apple | F_p 0.648078345471096 |
| 190 | +| apple | df 2 |
| 191 | +| apple | m [2.75 3 3.5] |
| 192 | +| apple | ms 0.583333333333334 |
| 193 | +| apple | se [0.853912563829966 0.816496580927726 0.645497224367903] |
| 194 | +| apple | ss 1.16666666666667 |
| 195 | +| apple || err df 6 |
| 196 | +| apple ~ bake | F 0.0666666666666671 |
| 197 | +| apple ~ bake | F_p 0.936190104380701 |
| 198 | +| apple ~ bake | df 2 |
| 199 | +| apple ~ bake | m [ |
| 200 | + [1.5 2 2.5] |
| 201 | + [ 4 4 4.5] |
| 202 | +] |
| 203 | +| apple ~ bake | ms 0.0833333333333339 |
| 204 | +| apple ~ bake | se [ |
| 205 | + [0.5 1 0.5] |
| 206 | + [ 1 1 0.5] |
| 207 | +] |
| 208 | +| apple ~ bake | ss 0.166666666666668 |
| 209 | +| apple ~ bake || err df 6 |
| 210 | +| bake | F 11.2666666666667 |
| 211 | +| bake | F_p 0.015294126084452 |
| 212 | +| bake | df 1 |
| 213 | +| bake | m [2 4.16666666666667] |
| 214 | +| bake | ms 14.0833333333333 |
| 215 | +| bake | se [0.365148371670111 0.401386485959742] |
| 216 | +| bake | ss 14.0833333333333 |
| 217 | +| bake || err df 6 |
| 218 | +</pre> |
| 219 | +<img src="images/demos/stats/output-9.png"/> |
| 220 | + |
| 221 | +<hr/> |
| 222 | +This concludes the demo. |
| 223 | +<p/> |
| 224 | +Be sure to check the documentation for <a href="https://metacpan.org/pod/PDL::Stats">PDL::Stats</a>, to see further |
| 225 | +possibilities. |
| 226 | +</body> |
| 227 | +</html> |
0 commit comments