module Statsample::Bivariate

Diverse methods and classes to calculate bivariate relations Specific classes:

Diverse methods and classes to calculate bivariate relations Specific classes:

Public Instance Methods

correlation_matrix(ds) click to toggle source

Correlation matrix. Order of rows and columns depends on Statsample::Dataset#fields order

# File lib/statsample/bivariate.rb, line 191
def correlation_matrix(ds)
  vars,cases=ds.fields.size,ds.cases
  if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
    cm=correlation_matrix_optimized(ds)
  else
    cm=correlation_matrix_pairwise(ds)
  end
  cm.extend(Statsample::CovariateMatrix)
  cm.fields=ds.fields
  cm
end
correlation_matrix_optimized(ds) click to toggle source
# File lib/statsample/bivariate.rb, line 203
def correlation_matrix_optimized(ds)
  s=covariance_matrix_optimized(ds)
  sds=GSL::Matrix.diagonal(s.diagonal.sqrt.pow(-1))
  cm=sds*s*sds
  # Fix diagonal
  s.row_size.times {|i|
    cm[i,i]=1.0
  }
  cm
end
correlation_matrix_pairwise(ds) click to toggle source
# File lib/statsample/bivariate.rb, line 213
def correlation_matrix_pairwise(ds)
  cache={}
  cm=ds.collect_matrix do |row,col|
    if row==col
      1.0
    elsif (ds[row].type!=:scale or ds[col].type!=:scale)
      nil
    else
      if cache[[col,row]].nil?
        r=pearson(ds[row],ds[col])
        cache[[row,col]]=r
        r
      else
        cache[[col,row]]
      end 
    end
  end
end
correlation_probability_matrix(ds, tails=:both) click to toggle source

Matrix of correlation probabilities. Order of rows and columns depends on Statsample::Dataset#fields order

# File lib/statsample/bivariate.rb, line 247
def correlation_probability_matrix(ds, tails=:both)
  rows=ds.fields.collect do |row|
    ds.fields.collect do |col|
      v1a,v2a=Statsample.only_valid_clone(ds[row],ds[col])
      (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
    end
  end
  Matrix.rows(rows)
end
covariance(v1,v2) click to toggle source

Covariance between two vectors

# File lib/statsample/bivariate.rb, line 13
def covariance(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  return nil if v1a.size==0
  if Statsample.has_gsl?
    GSL::Stats::covariance(v1a.gsl, v2a.gsl)
  else
    covariance_slow(v1a,v2a)
  end
end
covariance_matrix(ds) click to toggle source

Covariance matrix. Order of rows and columns depends on Statsample::Dataset#fields order

# File lib/statsample/bivariate.rb, line 155
def covariance_matrix(ds)
  vars,cases=ds.fields.size,ds.cases
  if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
    cm=covariance_matrix_optimized(ds)
  else
    cm=covariance_matrix_pairwise(ds)
    
  end
  cm.extend(Statsample::CovariateMatrix)
  cm.fields=ds.fields
  cm
end
covariance_matrix_optimized(ds) click to toggle source
# File lib/statsample/bivariate.rb, line 141
def covariance_matrix_optimized(ds)
  x=ds.to_gsl
  n=x.row_size
  m=x.column_size
  means=((1/n.to_f)*GSL::Matrix.ones(1,n)*x).row(0)
  centered=x-(GSL::Matrix.ones(n,m)*GSL::Matrix.diag(means))
  ss=centered.transpose*centered
  s=((1/(n-1).to_f))*ss
  s
end
covariance_matrix_pairwise(ds) click to toggle source
# File lib/statsample/bivariate.rb, line 169
def covariance_matrix_pairwise(ds)
  cache={}
  matrix=ds.collect_matrix do |row,col|
    if (ds[row].type!=:scale or ds[col].type!=:scale)
      nil
    elsif row==col
      ds[row].variance
    else
      if cache[[col,row]].nil?
        cov=covariance(ds[row],ds[col])
        cache[[row,col]]=cov
        cov
      else
         cache[[col,row]]
      end
    end
  end
  matrix
end
gamma(matrix) click to toggle source

Calculates Goodman and Kruskal's gamma.

Gamma is the surplus of concordant pairs over discordant pairs, as a percentage of all pairs ignoring ties.

Source: faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm

# File lib/statsample/bivariate.rb, line 305
def gamma(matrix)
  v=pairs(matrix)
  (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
end
maximum_likehood_dichotomic(pred,real) click to toggle source

Estimate the ML between two dichotomic vectors

# File lib/statsample/bivariate.rb, line 23
def maximum_likehood_dichotomic(pred,real)
  preda,reala=Statsample.only_valid_clone(pred,real)                
  sum=0
  preda.each_index{|i|
     sum+=(reala[i]*Math::log(preda[i])) + ((1-reala[i])*Math::log(1-preda[i]))
  }
  sum
end
min_n_valid(ds) click to toggle source

based on a dataset

# File lib/statsample/bivariate.rb, line 373
def min_n_valid(ds)
  min=ds.cases
  m=n_valid_matrix(ds)
  for x in 0...m.row_size
    for y in 0...m.column_size
      min=m[x,y] if m[x,y] < min
    end
  end
  min
end
n_valid_matrix(ds) click to toggle source

Retrieves the n valid pairwise.

# File lib/statsample/bivariate.rb, line 233
def n_valid_matrix(ds)
  ds.collect_matrix do |row,col|
    if row==col
      ds[row].valid_data.size
    else
      rowa,rowb=Statsample.only_valid_clone(ds[row],ds[col])
      rowa.size
    end
  end
end
ordered_pairs(vector) click to toggle source
# File lib/statsample/bivariate.rb, line 351
def ordered_pairs(vector)
  d=vector.data
  a=[]
  (0...(d.size-1)).each{|i|
    ((i+1)...(d.size)).each {|j|
      a.push([d[i],d[j]])
    }
  }
  a
end
pairs(matrix) click to toggle source

Calculate indexes for a matrix the rows and cols has to be ordered

# File lib/statsample/bivariate.rb, line 310
def pairs(matrix)
  # calculate concordant #p matrix
  rs=matrix.row_size
  cs=matrix.column_size
  conc=disc=ties_x=ties_y=0
  (0...(rs-1)).each do |x|
    (0...(cs-1)).each do |y|
      ((x+1)...rs).each do |x2|
        ((y+1)...cs).each do |y2|
          # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
          conc+=matrix[x,y]*matrix[x2,y2]
        end
      end
    end
  end
  (0...(rs-1)).each {|x|
    (1...(cs)).each{|y|
      ((x+1)...rs).each{|x2|
        (0...y).each{|y2|
          # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
          disc+=matrix[x,y]*matrix[x2,y2]
        }
      }
    }
  }
  (0...(rs-1)).each {|x|
    (0...(cs)).each{|y|
      ((x+1)...(rs)).each{|x2|
        ties_x+=matrix[x,y]*matrix[x2,y]
      }
    }
  }
  (0...rs).each {|x|
    (0...(cs-1)).each{|y|
      ((y+1)...(cs)).each{|y2|
        ties_y+=matrix[x,y]*matrix[x,y2]
      }
    }
  }
  {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
end
partial_correlation(v1,v2,control) click to toggle source

Correlation between v1 and v2, controling the effect of control on both.

# File lib/statsample/bivariate.rb, line 132
def partial_correlation(v1,v2,control)
  v1a,v2a,cona=Statsample.only_valid_clone(v1,v2,control)
  rv1v2=pearson(v1a,v2a)
  rv1con=pearson(v1a,cona)
  rv2con=pearson(v2a,cona)        
  (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
  
end
pearson(v1,v2) click to toggle source

Calculate Pearson correlation coefficient ® between 2 vectors

# File lib/statsample/bivariate.rb, line 43
def pearson(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  return nil if v1a.size ==0
  if Statsample.has_gsl?
    GSL::Stats::correlation(v1a.gsl, v2a.gsl)
  else
    pearson_slow(v1a,v2a)
  end
end
point_biserial(dichotomous,continous) click to toggle source

Calculate Point biserial correlation. Equal to Pearson correlation, with one dichotomous value replaced by “0” and the other by “1”

# File lib/statsample/bivariate.rb, line 265
def point_biserial(dichotomous,continous)
  ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
  raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
  raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
  f0=ds['d'].factors.sort[0]
  m0=ds.filter_field('c') {|c| c['d']==f0}
  m1=ds.filter_field('c') {|c| c['d']!=f0}
  ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
end
prediction_optimized(vars,cases) click to toggle source

Predicted time for optimized correlation matrix, in miliseconds See benchmarks/correlation_matrix.rb to see mode of calculation

# File lib/statsample/bivariate.rb, line 111
def prediction_optimized(vars,cases)
  ((4+0.018128*cases+0.246871*vars+0.001169*vars*cases)**2) / 100
end
prediction_pairwise(vars,cases) click to toggle source

Predicted time for pairwise correlation matrix, in miliseconds See benchmarks/correlation_matrix.rb to see mode of calculation

# File lib/statsample/bivariate.rb, line 105
def prediction_pairwise(vars,cases)
  ((-0.518111-0.000746*cases+1.235608*vars+0.000740*cases*vars)**2) / 100
end
prop_pearson(t, size, tails=:both) click to toggle source

Retrieves the probability value (a la SPSS) for a given t, size and number of tails. Uses a second parameter

  • :both or 2 : for r!=0 (default)

  • :right, :positive or 1 : for r > 0

  • :left, :negative : for r < 0

# File lib/statsample/bivariate.rb, line 83
def prop_pearson(t, size, tails=:both)
  tails=:both if tails==2
  tails=:right if tails==1 or tails==:positive
  tails=:left if tails==:negative
  
  n_tails=case tails
    when :both then 2
    else 1
  end
  t=-t if t>0 and (tails==:both)
  cdf=Distribution::T.cdf(t, size-2)
  if(tails==:right)
    1.0-(cdf*n_tails)
  else
    cdf*n_tails
  end
end
residuals(from,del) click to toggle source

Returns residual score after delete variance from another variable

# File lib/statsample/bivariate.rb, line 117
def residuals(from,del)
  r=Statsample::Bivariate.pearson(from,del)
  froms, dels = from.vector_standarized, del.vector_standarized
  nv=[]
  froms.data_with_nils.each_index do |i|
    if froms[i].nil? or dels[i].nil?
      nv.push(nil)
    else
      nv.push(froms[i]-r*dels[i])
    end
  end
  nv.to_vector(:scale)
end
spearman(v1,v2) click to toggle source

Spearman ranked correlation coefficient (rho) between 2 vectors

# File lib/statsample/bivariate.rb, line 258
def spearman(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
  pearson(v1r,v2r)
end
sum_of_squares(v1,v2) click to toggle source
# File lib/statsample/bivariate.rb, line 36
def sum_of_squares(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)        
  m1=v1a.mean
  m2=v2a.mean
  (v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
end
t_pearson(v1,v2) click to toggle source

Retrieves the value for t test for a pearson correlation between two vectors to test the null hipothesis of r=0

# File lib/statsample/bivariate.rb, line 61
def t_pearson(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  r=pearson(v1a,v2a)
  if(r==1.0) 
    0
  else
    t_r(r,v1a.size)
  end
end
t_r(r,size) click to toggle source

Retrieves the value for t test for a pearson correlation giving r and vector size Source : faculty.chass.ncsu.edu/garson/PA765/correl.htm

# File lib/statsample/bivariate.rb, line 73
def t_r(r,size)
  r * Math::sqrt(((size)-2).to_f / (1 - r**2))
end
tau_a(v1,v2) click to toggle source

Kendall Rank Correlation Coefficient (Tau a) Based on Hervé Adbi article

# File lib/statsample/bivariate.rb, line 276
def tau_a(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  n=v1.size
  v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
  o1=ordered_pairs(v1r)
  o2=ordered_pairs(v2r)
  delta= o1.size*2-(o2  & o1).size*2
  1-(delta * 2 / (n*(n-1)).to_f)
end
tau_b(matrix) click to toggle source

Calculates Goodman and Kruskal’s Tau b correlation. Tb is an asymmetric P-R-E measure of association for nominal scales (Mielke, X)

Tau-b defines perfect association as strict monotonicity. Although it requires strict monotonicity to reach 1.0, it does not penalize ties as much as some other measures.

Reference

Mielke, P. GOODMAN–KRUSKAL TAU AND GAMMA. Source: faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm

# File lib/statsample/bivariate.rb, line 295
def tau_b(matrix)
  v=pairs(matrix)
  ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
end