KJB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
prob_stat.h
Go to the documentation of this file.
1 /* $Id: prob_stat.h 18278 2014-11-25 01:42:10Z ksimek $ */
2 /* =========================================================================== *
3  |
4  | Copyright (c) 1994-2010 by Kobus Barnard (author)
5  |
6  | Personal and educational use of this code is granted, provided that this
7  | header is kept intact, and that the authorship is not misrepresented, that
8  | its use is acknowledged in publications, and relevant papers are cited.
9  |
10  | For other use contact the author (kobus AT cs DOT arizona DOT edu).
11  |
12  | Please note that the code in this file has not necessarily been adequately
13  | tested. Naturally, there is no guarantee of performance, support, or fitness
14  | for any particular task. Nonetheless, I am interested in hearing about
15  | problems that you encounter.
16  |
17  | Author: Kyle Simek, Ernesto Brau
18  * =========================================================================== */
19 
20 
21 #ifndef KJB_PROB_STAT
22 #define KJB_PROB_STAT
23 
27 #include "l_cpp/l_exception.h"
28 #include <boost/concept_check.hpp>
29 #include <boost/function.hpp>
30 #include <iterator>
31 #include <numeric>
32 #include <algorithm>
33 #include <functional>
34 #include <vector>
35 #include <iterator>
38 #include "prob_cpp/prob_pdf.h"
39 
40 namespace kjb
41 {
42 
53  template <class Iterator, class Value_type>
54 //typename std::iterator_traits<Iterator>::value_type
55 Value_type
56 mean(Iterator begin, Iterator end, const Value_type& /* dummy */)
57 {
58  using namespace boost;
59  BOOST_CONCEPT_ASSERT((DefaultConstructible<Value_type>));
60  BOOST_CONCEPT_ASSERT((InputIterator<Iterator>));
61 
62 // typedef typename std::iterator_traits<Iterator>::value_type value_type;
63  typedef typename std::iterator_traits<Iterator>::difference_type difference_type;
64  if(begin == end)
65  {
66  KJB_THROW_2(Illegal_argument, "Can't take mean of collection of zero size.");
67  }
68 
69  difference_type size = std::distance(begin, end);
70 
71  const Value_type& first = *begin;
72  std::advance(begin, 1);
73  Value_type sum = std::accumulate(begin, end, first);
74 
75  return sum /= size;
76 }
77 
89 template <class Iterator>
90 typename std::iterator_traits<Iterator>::value_type
91 mean(Iterator begin, Iterator end = Iterator())
92 {
93  using namespace boost;
94 
95  //typedef typename std::iterator_traits<Iterator>::value_type Value_type;
96 
97  BOOST_CONCEPT_ASSERT((InputIterator<Iterator>));
98 
99  return mean(begin, end, *begin);
100 }
101 
102 /* ============================================================================ *
103  * *
104  * STATISTICAL TESTING STUFF *
105  * *
106  * ============================================================================ */
107 
108 /* template<class Iterator>
109 struct Compute_statistic
110 {
111 }; */
112 
130 template<class InputIterator, class Distribution, class Statistic>
132 (
133  InputIterator first,
134  InputIterator last,
135  const Distribution& dist,
136  const Statistic& statistic,
137  //const boost::function3<double, InputIterator, InputIterator, InputIterator>& statistic,
138  double alpha,
139  int num_params,
140  int num_bins
141 )
142 {
143  if(num_params + 1 >= num_bins)
144  {
145  KJB_THROW_2(Runtime_error, "Not enough bins to perform goodness-of-fit test.");
146  }
147 
148  if(alpha <= 0 || alpha >= 1)
149  {
150  KJB_THROW_2(Runtime_error, "The p-value must be in (0,1).");
151  }
152 
153  int N = std::distance(first, last);
154  if(N < 10)
155  {
156  KJB_THROW_2(Runtime_error, "Not enough data to perform test.");
157  }
158 
159  Histogram h(first, last, num_bins);
160  std::vector<int> expected;
161  std::vector<int> observed;
162 
163  for(std::map<double, int>::const_iterator pair_p = h.as_map().begin(); pair_p != h.as_map().end(); pair_p++)
164  {
165  if(pair_p->second < 0)
166  {
167  KJB_THROW_2(Runtime_error, "Histogram values cannot negative.");
168  }
169 
170  if(pair_p->second > 0)
171  {
172  double low_lim = pair_p->first;
173  double up_lim = low_lim + h.bin_size();
174  int val = round(N * (cdf(dist, up_lim) - cdf(dist, low_lim)));
175  expected.push_back(val + (val == 0 ? 1 : 0));
176  observed.push_back(pair_p->second);
177  }
178  }
179 
180  int num_ne_bins = expected.size();
181  int dof = num_ne_bins - (num_params + 1);
182  double X = statistic(observed.begin(), observed.end(), expected.begin());
183 
184  // william's correction
185  double q = 1 + ((num_ne_bins + 1) / (6 * N));
186  double X_alpha = quantile(Chi_square_distribution(dof), 1 - alpha);
187 
188  return !((X / q) > X_alpha);
189 }
190 
204 template<class InputIterator, class Distribution, class Statistic>
206 (
207  InputIterator first,
208  InputIterator last,
209  const Distribution& dist,
210  const Statistic& statistic,
211  double alpha,
212  int dof_redux
213 )
214 {
215  if(alpha <= 0 || alpha >= 1)
216  {
217  KJB_THROW_2(Runtime_error, "The p-value must be in (0,1).");
218  }
219 
220  int N = std::distance(first, last);
221  if(N < 10)
222  {
223  KJB_THROW_2(Runtime_error, "Not enough data to perform test.");
224  }
225 
226  Histogram h(first, last, -1);
227  std::vector<int> expected;
228  std::vector<int> observed;
229 
230  for(std::map<double, int>::const_iterator pair_p = h.as_map().begin(); pair_p != h.as_map().end(); pair_p++)
231  {
232  if(pair_p->second < 0)
233  {
234  KJB_THROW_2(Runtime_error, "Histogram values cannot negative.");
235  }
236 
237  if(pair_p->second > 0)
238  {
239  int val = round(N * pdf(dist, pair_p->first));
240  expected.push_back(val + (val == 0 ? 1 : 0));
241  observed.push_back(pair_p->second);
242  }
243  }
244 
245  double X = statistic(observed.begin(), observed.end(), expected.begin());
246  double X_alpha = quantile(Chi_square_distribution(observed.size() - dof_redux), 1 - alpha);
247 
248  return !(X > X_alpha);
249 }
250 
254 inline
255 double chi2stat_helper(int O, int E)
256 {
257  // With yates' correction
258  return ((fabs(O - E) - 0.5) * (fabs(O - E) - 0.5)) / E;
259 }
260 
267 template<class InputIterator>
268 inline
269 double chi_square_statistic(InputIterator first1, InputIterator last1, InputIterator first2)
270 {
271  std::vector<double> sum_elems(std::distance(first1, last1));
272  std::transform(first1, last1, first2, sum_elems.begin(), std::ptr_fun(chi2stat_helper));
273 
274  return std::accumulate(sum_elems.begin(), sum_elems.end(), 0.0);
275 }
276 
280 inline
281 double gstat_helper(int O, int E)
282 {
283  // Yates' correction
284  double O_i = O;
285  if(O > E)
286  {
287  O_i -= 0.5;
288  }
289  else if(E > O)
290  {
291  O_i += 0.5;
292  }
293 
294  return O_i * std::log(O_i / E);
295 }
296 
303 template<class InputIterator>
304 inline
305 double g_statistic(InputIterator first1, InputIterator last1, InputIterator first2)
306 {
307  std::vector<double> sum_elems(std::distance(first1, last1));
308  std::transform(first1, last1, first2, sum_elems.begin(), std::ptr_fun(gstat_helper));
309 
310  return 2 * std::accumulate(sum_elems.begin(), sum_elems.end(), 0.0);
311 }
312 
324 template<class InputIterator, class Distribution>
325 inline
326 bool chi_square_test
327 (
328  InputIterator first,
329  InputIterator last,
330  const Distribution& dist,
331  double alpha,
332  int num_params,
333  int num_bins
334 )
335 {
336  //boost::function3<double, InputIterator, InputIterator, InputIterator> stat = &chi_square_statistic;
337  typedef std::vector<int>::const_iterator IntIterator;
338  boost::function3<double, IntIterator, IntIterator, IntIterator> stat = &chi_square_statistic<IntIterator>;
339  return goodness_of_fit_test(first, last, dist, stat, alpha, num_params, num_bins);
340 }
341 
352 template<class InputIterator, class Distribution>
353 bool chi_square_test
354 (
355  InputIterator first,
356  InputIterator last,
357  const Distribution& dist,
358  double alpha,
359  int dof_redux
360 )
361 {
362  //boost::function3<double, InputIterator, InputIterator, InputIterator> stat = &chi_square_statistic;
363  typedef std::vector<int>::const_iterator IntIterator;
364  boost::function3<double, IntIterator, IntIterator, IntIterator> stat = &chi_square_statistic<IntIterator>;
365  return goodness_of_fit_test(first, last, dist, stat, alpha, dof_redux);
366 }
367 
379 template<class InputIterator, class Distribution>
380 inline
381 bool g_test
382 (
383  InputIterator first,
384  InputIterator last,
385  const Distribution& dist,
386  double alpha,
387  int num_params,
388  int num_bins
389 )
390 {
391  typedef std::vector<int>::const_iterator IntIterator;
392  boost::function3<double, IntIterator, IntIterator, IntIterator> stat = &g_statistic<IntIterator>;
393  return goodness_of_fit_test(first, last, dist, stat, alpha, num_params, num_bins);
394 }
395 
406 template<class InputIterator, class Distribution>
407 bool g_test
408 (
409  InputIterator first,
410  InputIterator last,
411  const Distribution& dist,
412  double alpha,
413  int dof_redux
414 )
415 {
416  typedef std::vector<int>::const_iterator IntIterator;
417  boost::function3<double, IntIterator, IntIterator, IntIterator> stat = &g_statistic<IntIterator>;
418  return goodness_of_fit_test(first, last, dist, stat, alpha, dof_redux);
419 }
420 
421 } // namespace kjb
422 
423 #endif
boost::math::chi_squared Chi_square_distribution
Definition: prob_distribution.h:63
Definition of various standard probability distributions.
double accumulate(const Matrix_d< R, C, T > &mat, double init)
Definition: m_matrix_d.impl.h:432
double chi2stat_helper(int O, int E)
Compute the chi-square test value for a single element.
Definition: prob_stat.h:255
bool goodness_of_fit_test(InputIterator first, InputIterator last, const Distribution &dist, const Statistic &statistic, double alpha, int num_params, int num_bins)
Definition: prob_stat.h:132
double bin_size() const
Return the bin size of this histogram.
Definition: prob_histogram.h:83
double pdf(const MV_gaussian_distribution &P, const Vector &x)
Computes the joint PDF of a multivariate Gaussian at x.
Definition: prob_pdf.cpp:32
bool g_test(InputIterator first, InputIterator last, const Distribution &dist, double alpha, int num_params, int num_bins)
Definition: prob_stat.h:382
double chi_square_statistic(InputIterator first1, InputIterator last1, InputIterator first2)
Definition: prob_stat.h:269
Value_type mean(Iterator begin, Iterator end, const Value_type &)
Definition: prob_stat.h:56
double dist(const pt a, const pt b)
compute approx. Great Circle distance between two UTM points
Definition: layer.cpp:45
PDFs and CDFs for the different distributions defined in "prob_distribution.h".
const std::map< double, int > & as_map() const
Return a vector of the bins, represented by their lower bounds.
Definition: prob_histogram.h:92
double g_statistic(InputIterator first1, InputIterator last1, InputIterator first2)
Definition: prob_stat.h:305
#define KJB_THROW_2(ex, msg)
Definition: l_exception.h:48
sum(zmx.*zmy) sum(zmy.^2)]
bool chi_square_test(InputIterator first, InputIterator last, const Distribution &dist, double alpha, int num_params, int num_bins)
Definition: prob_stat.h:327
A class that represents a histogram of data. ATM, the data must be doubles.
Definition: prob_histogram.h:47
double cdf(const Log_normal_distribution &dist, double x)
Computes the cdf of a log-normal distribution at x.
Definition: prob_pdf.h:151
Object thrown when an argument to a function is not acceptable.
Definition: l_exception.h:377
Support for error handling exception classes in libKJB.
double gstat_helper(int O, int E)
Compute the G-test value for a single element.
Definition: prob_stat.h:281
Object thrown when computation fails somehow during execution.
Definition: l_exception.h:321