Distributed statistics

Whether threading an application or using distributed means, computing basics statistics becomes a bit more challenging. In the post (which I’ll update), I’ll break-down basic statistical functions for implementation in a distributed environment.

Contents:

Count

Count is simply the sum of the counts over all machines.
Single machine: \(O\left(n\right)\)
\begin{equation}
\sum_{i=1}^{n}1=n\tag{1}
\end{equation}
Solve with \(m\) machines: \(O\left(\max\left(n_{1},n_{2},…,n_{m}\right)+m\right)\)
\begin{equation}
n=\sum_{i=1}^{n_{1}}1+\sum_{i=1}^{n_{2}}1+\cdots+\sum_{i=1}^{n_{m}}1=n_{1}+n_{2}+\cdots+n_{m}\tag{2}
\end{equation}

Maximum

Maximum is simply the maximum of the maximums.
Single machine: \(O\left(n\right)\)
\begin{equation}
\max_{i=1,…,n}\left(x_{i}\right)\tag{3}
\end{equation}
Solve with \(m\) machines: \(O\left(\max\left(n_{1},n_{2},…,n_{m}\right)+m\right)\)
\begin{equation}
\max_{i=1,…,n}\left(x_{i}\right)=\max\left(\max_{i=1,…,n_{1}}\left(x_{i}\right),\max_{i=1,…,n_{2}}\left(x_{i}\right),…,\max_{i=1,…,n_{m}}\left(x_{i}\right)\right)\tag{4}
\end{equation}

Minimum

Minimum is simply the minimum of the minimums.
Single machine: \(O\left(n\right)\)
\begin{equation}
\min_{i=1,…,n}\left(x_{i}\right)\tag{5}
\end{equation}
Solve with \(m\) machines: \(O\left(\max\left(n_{1},n_{2},…,n_{m}\right)+m\right)\)
\begin{equation}
\min_{i=1,…,n}\left(x_{i}\right)=\min\left(\min_{i=1,…,n_{1}}\left(x_{i}\right),\min_{i=1,…,n_{2}}\left(x_{i}\right),…,\min_{i=1,…,n_{m}}\left(x_{i}\right)\right)\tag{6}
\end{equation}

Sum

Sum is simply the sum of the sums.
Single machine: \(O\left(n\right)\)
\begin{equation}
\sum_{i=1}^{n}x_{i}\tag{7}
\end{equation}
Solve with \(m\) machines: \(O\left(\max\left(n_{1},n_{2},…,n_{m}\right)+m\right)\)
\begin{equation}
\sum_{i=1}^{n}x_{i}=\sum_{i=1}^{n_{1}}x_{i}+\sum_{i=1}^{n_{2}}x_{i}+\cdots+\sum_{i=1}^{n_{m}}x_{i}\tag{8}
\end{equation}

Average

Average requires two elements: count and sum.
Single machine: \(O\left(n\right)\)
\begin{equation}
\sum_{i=1}^{n}\frac{x_{i}}{n}=\frac{\sum_{i=1}^{n}x_{i}}{n}\tag{9}
\end{equation}
Solve with \(m\) machines: \(O\left(\max\left(n_{1},n_{2},…,n_{m}\right)+m\right)\)
\begin{equation}
n=\sum_{i=1}^{n_{1}}1+\sum_{i=1}^{n_{2}}1+\cdots+\sum_{i=1}^{n_{m}}1=n_{1}+n_{2}+\cdots+n_{m}\label{eq:1avg}\tag{10}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}x_{i}=\sum_{i=1}^{n_{1}}x_{i}+\sum_{i=1}^{n_{2}}x_{i}+\cdots+\sum_{i=1}^{n_{m}}x_{i}\label{eq:2avg}\tag{11}
\end{equation}
\begin{equation}
\frac{\sum_{i=1}^{n}x_{i}}{n}=\frac{(\ref{eq:2avg})}{(\ref{eq:1avg})}\tag{12}
\end{equation}

Variance

Variance requires three pieces: count, sum, and sum squared. Variance here is population-based. Multiply by \(\frac{n}{n-1}\) for sample.
Single machine: \(O\left(n\right)\)
\begin{equation}
\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)^{2}}{n}=\frac{\sum_{i=1}^{n}\left(x_{i}\right)^{2}}{n}-\left(\frac{\sum_{i=1}^{n}x_{i}}{n}\right)^{2}\tag{13}
\end{equation}
Proof:
\begin{eqnarray*}
\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)^{2}}{n} & = & \sum_{i=1}^{n}\frac{x_{i}^{2}-2x_{i}\bar{x}+\bar{x}^{2}}{n}\\
& = & \sum_{i=1}^{n}\frac{x_{i}^{2}}{n}-2\bar{x}\sum_{i=1}^{n}\frac{x_{i}}{n}+\bar{x}^{2}\sum_{i=1}^{n}\frac{1}{n}\\
& = & \sum_{i=1}^{n}\frac{x_{i}^{2}}{n}-2\bar{x}^{2}+\bar{x}^{2}\\
& = & \sum_{i=1}^{n}\frac{x_{i}^{2}}{n}-\bar{x}^{2}\\
& = & \sum_{i=1}^{n}\frac{x_{i}^{2}}{n}-\left(\sum_{i=1}^{n}\frac{x_{i}}{n}\right)^{2}
\end{eqnarray*}
Solve with \(m\) machines: \(O\left(\max\left(n_{1},n_{2},…,n_{m}\right)+m\right)\)
\begin{equation}
n=\sum_{i=1}^{n_{1}}1+\sum_{i=1}^{n_{2}}1+\cdots+\sum_{i=1}^{n_{m}}1=n_{1}+n_{2}+\cdots+n_{m}\label{eq:1var}\tag{14}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}x_{i}=\sum_{i=1}^{n_{1}}x_{i}+\sum_{i=1}^{n_{2}}x_{i}+\cdots+\sum_{i=1}^{n_{m}}x_{i}\label{eq:2var}\tag{15}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}x_{i}^{2}=\sum_{i=1}^{n_{1}}x_{i}^{2}+\sum_{i=1}^{n_{2}}x_{i}^{2}+\cdots+\sum_{i=1}^{n_{m}}x_{i}^{2}\label{eq:3var}\tag{16}
\end{equation}
\begin{equation}
\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)^{2}}{n}=\frac{\sum_{i=1}^{n}x_{i}^{2}}{n}-\left(\frac{\sum_{i=1}^{n}x_{i}}{n}\right)^{2}\\
=\frac{(\ref{eq:3var})}{(\ref{eq:1var})}-\left(\frac{(\ref{eq:2var})}{(\ref{eq:1var})}\right)^{2}=\frac{(\ref{eq:3var})(\ref{eq:1var})-(\ref{eq:2var})^{2}}{(\ref{eq:1var})^{2}}\tag{17}
\end{equation}

Covariance

Covariance requires four pieces: count, sum of \(x\), sum of \(y\), and the sum product of \(xy\). Covariance here is population-based. Multiply by \(\frac{n}{n-1}\) for sample.
Single machine: \(O\left(n\right)\)
\begin{equation}
cov\left(x,y\right)=\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)\left(y_{i}-\bar{y}\right)}{n}=\frac{\sum_{i=1}^{n}x_{i}y_{i}}{n}-\frac{\sum_{i=1}^{n}x_{i}\sum_{i=1}^{n}y_{i}}{n^{2}}\tag{18}
\end{equation}
Proof:
\begin{eqnarray*}
\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)\left(y_{i}-\bar{y}\right)}{n} & = & \frac{\sum_{i=1}^{n}x_{i}y_{i}-x_{i}\bar{y}-\bar{x}y_{i}+\bar{x}\bar{y}}{n}\\
& = & \frac{\sum_{i=1}^{n}x_{i}y_{i}}{n}-\bar{y}\frac{\sum_{i=1}^{n}x_{i}}{n}-\bar{x}\frac{\sum_{i=1}^{n}y_{i}}{n}+\bar{x}\bar{y}\frac{\sum_{i=1}^{n}1}{n}\\
& = & \frac{\sum_{i=1}^{n}x_{i}y_{i}}{n}-\bar{x}\bar{y}\\
& = & \frac{\sum_{i=1}^{n}x_{i}y_{i}}{n}-\left(\frac{\sum_{i=1}^{n}x_{i}}{n}\right)\left(\frac{\sum_{i=1}^{n}y_{i}}{n}\right)\\
& = & \frac{\sum_{i=1}^{n}x_{i}y_{i}}{n}-\frac{\sum_{i=1}^{n}x_{i}\sum_{i=1}^{n}y_{i}}{n^{2}}
\end{eqnarray*}
Solve with \(m\) machines: \(O\left(\max\left(n_{1},n_{2},…,n_{m}\right)+m\right)\)
\begin{equation}
n=\sum_{i=1}^{n_{1}}1+\sum_{i=1}^{n_{2}}1+\cdots+\sum_{i=1}^{n_{m}}1=n_{1}+n_{2}+\cdots+n_{m}\label{eq:1covar}\tag{19}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}x_{i}=\sum_{i=1}^{n_{1}}x_{i}+\sum_{i=1}^{n_{2}}x_{i}+\cdots+\sum_{i=1}^{n_{m}}x_{i}\label{eq:2covar}\tag{20}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}y_{i}=\sum_{i=1}^{n_{1}}y_{i}+\sum_{i=1}^{n_{2}}y_{i}+\cdots+\sum_{i=1}^{n_{m}}y_{i}\label{eq:3covar}\tag{21}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}x_{i}y_{i}=\sum_{i=1}^{n_{1}}x_{i}y_{i}+\sum_{i=1}^{n_{2}}x_{i}y_{i}+\cdots+\sum_{i=1}^{n_{m}}x_{i}y_{i}\label{eq:4covar}\tag{22}
\end{equation}
\begin{equation}
\frac{\sum_{i=1}^{n}x_{i}y_{i}}{n}-\frac{\sum_{i=1}^{n}x_{i}\sum_{i=1}^{n}y_{i}}{n^{2}}=\frac{(\ref{eq:4covar})}{(\ref{eq:1covar})}-\frac{(\ref{eq:2covar})(\ref{eq:3covar})}{(\ref{eq:1covar})^{2}}\tag{23}
\end{equation}

Pearson Correlation Coefficient

The Pearson Correlation Coefficient requires six pieces: count, sum of \(x\), sum of \(y\), sum of \(x^{2}\), sum of \(y^{2}\), and sum product of \(xy\).
Single machine: \(O\left(n\right)\)
\begin{equation}
r_{xy}=\frac{\sum_{i=1}^{n}x_{i}y_{i}-\left(\frac{\sum_{i=1}^{n}x_{i}\sum_{i=1}^{n}y_{i}}{n}\right)}{\sqrt{\left(\sum_{i=1}^{n}x_{i}^{2}-\frac{\left(\sum_{i=1}^{n}x_{i}\right)^{2}}{n}\right)\left(\sum_{i=1}^{n}y_{i}^{2}-\frac{\left(\sum_{i=1}^{n}y_{i}\right)^{2}}{n}\right)}}=\frac{cov\left(x,y\right)}{\sigma_{x}\sigma_{y}}\tag{24}
\end{equation}
Solve with \(m\) machines: \(O\left(\max\left(n_{1},n_{2},…,n_{m}\right)+m\right)\)
\begin{equation}
n=\sum_{i=1}^{n_{1}}1+\sum_{i=1}^{n_{2}}1+\cdots+\sum_{i=1}^{n_{m}}1=n_{1}+n_{2}+\cdots+n_{m}\label{eq:1}\tag{25}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}x_{i}=\sum_{i=1}^{n_{1}}x_{i}+\sum_{i=1}^{n_{2}}x_{i}+\cdots+\sum_{i=1}^{n_{m}}x_{i}\label{eq:2}\tag{26}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}y_{i}=\sum_{i=1}^{n_{1}}y_{i}+\sum_{i=1}^{n_{2}}y_{i}+\cdots+\sum_{i=1}^{n_{m}}y_{i}\label{eq:3}\tag{27}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}x_{i}^{2}=\sum_{i=1}^{n_{1}}x_{i}^{2}+\sum_{i=1}^{n_{2}}x_{i}^{2}+\cdots+\sum_{i=1}^{n_{m}}x_{i}^{2}\label{eq:4}\tag{28}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}y_{i}^{2}=\sum_{i=1}^{n_{1}}y_{i}^{2}+\sum_{i=1}^{n_{2}}y_{i}^{2}+\cdots+\sum_{i=1}^{n_{m}}y_{i}^{2}\label{eq:5}\tag{29}
\end{equation}
\begin{equation}
\sum_{i=1}^{n}x_{i}y_{i}=\sum_{i=1}^{n_{1}}x_{i}y_{i}+\sum_{i=1}^{n_{2}}x_{i}y_{i}+\cdots+\sum_{i=1}^{n_{m}}x_{i}y_{i}\label{eq:6}\tag{30}
\end{equation}
\begin{equation}
r_{xy}=\frac{(\ref{eq:6})-\left(\frac{(\ref{eq:2})(\ref{eq:3})}{(\ref{eq:1})}\right)}{\sqrt{\left((\ref{eq:4})-\frac{(\ref{eq:2})^{2}}{(\ref{eq:1})}\right)\left((\ref{eq:5})-\frac{(\ref{eq:3})^{2}}{(\ref{eq:1})}\right)}}\tag{31}
\end{equation}