Description of distance2curve

0001 function [xy,distance,t_a] = distance2curve(curvexy,mapxy,interpmethod)
0002 % distance2curve: minimum distance from a point to a general curvilinear n-dimensional arc
0003 % usage: [xy,distance,t] = distance2curve(curvexy,mapxy) % uses linear curve segments
0004 % usage: [xy,distance,t] = distance2curve(curvexy,mapxy,interpmethod)
0005 %
0006 % Identifies the closest point along a general space curve (a 1-d path
0007 % in some space) to some new set of points. The curve may be piecewise
0008 % linear or a parametric spline or pchip model.
0009 %
0010 % arguments: (input)
0011 %  curvexy - An nxp real numeric array containing the points of the
0012 %        curve. For 2-dimensional curves, p == 2. This will be a list
0013 %        of points (each row of the array is a new point) that
0014 %        define the curve. The curve may cross itself in space.
0015 %        Closed curves are acceptable, in which case the first
0016 %        and last points would be identical. (Sorry, but periodic
0017 %        end conditions are not an option for the spline at this time.)
0018 %
0019 %        Since a curve makes no sense in less than 2 dimensions,
0020 %        p >= 2 is required.
0021 %
0022 %  mapxy - an mxp real numeric array, where m is the number of new points
0023 %        to be mapped to the curve in term of their closest distance.
0024 %
0025 %        These points which will be mapped to the existing curve
0026 %        in terms of the minimium (euclidean, 2-norm) distance
0027 %        to the curve. Each row of this array will be a different
0028 %        point.
0029 %
0030 %  interpmethod - (OPTIONAL) string flag - denotes the method
0031 %        used to compute the arc length of the curve.
0032 %
0033 %        method may be any of 'linear', 'spline', or 'pchip',
0034 %        or any simple contraction thereof, such as 'lin',
0035 %        'sp', or even 'p'.
0036 %
0037 %        interpmethod == 'linear' --> Uses a linear chordal
0038 %               approximation to define the curve.
0039 %               This method is the most efficient.
0040 %
0041 %        interpmethod == 'pchip' --> Fits a parametric pchip
0042 %               approximation.
0043 %
0044 %        interpmethod == 'spline' --> Uses a parametric spline
0045 %               approximation to fit the curves. Generally for
0046 %               a smooth curve, this method may be most accurate.
0047 %
0048 %        DEFAULT: 'linear'
0049 %
0050 % arguments: (output)
0051 %  xy - an mxp array, contains the closest point identified along
0052 %       the curve to each of the points provided in mapxy.
0053 %
0054 %  distance - an mx1 vector, the actual distance to the curve,
0055 %       in terms minimum Euclidean distance.
0056 %
0057 %  t  - fractional arc length along the interpolating curve to that
0058 %       point. This is the same value that interparc would use to
0059 %       produce the points in xy.
0060 %
0061 %
0062 % Example:
0063 % % Find the closest points and the distance to a polygonal line from
0064 % % several test points.
0065 %
0066 % curvexy = [0 0;1 0;2 1;0 .5;0 0];
0067 % mapxy = [3 4;.5 .5;3 -1];
0068 % [xy,distance,t] = distance2curve(curvexy,mapxy,'linear')
0069 % % xy =
0070 % %                          2                         1
0071 % %          0.470588235294118         0.617647058823529
0072 % %                        1.5                       0.5
0073 % % distance =
0074 % %           3.16227766016838
0075 % %          0.121267812518166
0076 % %           2.12132034355964
0077 % % t =
0078 % %          0.485194315877587
0079 % %          0.802026225550702
0080 % %           0.34308419095021
0081 %
0082 %
0083 % plot(curvexy(:,1),curvexy(:,2),'k-o',mapxy(:,1),mapxy(:,2),'r*')
0084 % hold on
0085 % plot(xy(:,1),xy(:,2),'g*')
0086 % line([mapxy(:,1),xy(:,1)]',[mapxy(:,2),xy(:,2)]','color',[0 0 1])
0087 % axis equal
0088 %
0089 %
0090 % Example:
0091 % % Solve for the nearest point on the curve of a 3-d quasi-elliptical
0092 % % arc (sampled and interpolated from 20 points) mapping a set of points
0093 % % along a surrounding circle onto the ellipse. This is the example
0094 % % used to generate the screenshot figure.
0095 % t = linspace(0,2*pi,20)';
0096 % curvexy = [cos(t) - 1,3*sin(t) + cos(t) - 1.25,(t/2 + cos(t)).*sin(t)];
0097 %
0098 % s = linspace(0,2*pi,100)';
0099 % mapxy = 5*[cos(s),sin(s),sin(s)];
0100 % xy = distance2curve(curvexy,mapxy,'spline');
0101 %
0102 % plot3(curvexy(:,1),curvexy(:,2),curvexy(:,3),'ko')
0103 % line([mapxy(:,1),xy(:,1)]',[mapxy(:,2),xy(:,2)]',[mapxy(:,3),xy(:,3)]','color',[0 0 1])
0104 % axis equal
0105 % axis square
0106 % box on
0107 % grid on
0108 % view(26,-6)
0109 %
0110 %
0111 % Example:
0112 % % distance2curve is fairly fast, at least for the linear case.
0113 % % Map 1e6 points onto a polygonal curve in 10 dimensions.
0114 % curvexy = cumsum(rand(10,10));
0115 % mapxy = rand(1000000,10)*5;
0116 % tic,[xy,distance] = distance2curve(curvexy,mapxy,'linear');toc
0117 % % Elapsed time is 2.867453 seconds.
0118 %
0119 %
0120 % See also: interparc, spline, pchip, interp1, arclength
0121 %
0122 % Author: John D'Errico
0123 % e-mail: woodchips@rochester.rr.com
0124 % Release: 1.0
0125 % Release date: 9/22/2010
0126 
0127 % check for errors, defaults, etc...
0128 if (nargin < 2)
0129   error('DISTANCE2CURVE:insufficientarguments', ...
0130     'at least curvexy and mapxy must be supplied')
0131 elseif nargin > 3
0132   error('DISTANCE2CURVE:abundantarguments', ...
0133     'Too many arguments were supplied')
0134 end
0135 
0136 % get the dimension of the space our points live in
0137 [n,p] = size(curvexy);
0138 
0139 if isempty(curvexy) || isempty(mapxy)
0140   % empty begets empty. you might say this was a pointless exercise.
0141   xy = zeros(0,p);
0142   distance = zeros(0,p);
0143   t_a = zeros(0,p);
0144   return
0145 end
0146 
0147 % do curvexy and mapxy live in the same space?
0148 if size(mapxy,2) ~= p
0149   error('DISTANCE2CURVE:improperpxorpy', ...
0150     'curvexy and mapxy do not appear to live in the same dimension spaces')
0151 end
0152 
0153 % do the points live in at least 2 dimensions?
0154 if p < 2
0155   error('DISTANCE2CURVE:improperpxorpy', ...
0156     'The points MUST live in at least 2 dimensions for any curve to be defined.')
0157 end
0158 
0159 % how many points to be mapped to the curve?
0160 m = size(mapxy,1);
0161 
0162 % make sure that curvexy and mapxy are doubles, as uint8, etc
0163 % would cause problems down the line.
0164 curvexy = double(curvexy);
0165 mapxy = double(mapxy);
0166 
0167 % test for complex inputs
0168 if ~isreal(curvexy) || ~isreal(mapxy)
0169   error('DISTANCE2CURVE:complexinputs','curvexy and mapxy may not be complex')
0170 end
0171 
0172 % default for interpmethod
0173 if (nargin < 3) || isempty(interpmethod)
0174   interpmethod = 'linear';
0175 elseif ~ischar(interpmethod)
0176   error('DISTANCE2CURVE:invalidinterpmethod', ...
0177     'Invalid method indicated. Only ''linear'',''pchip'',''spline'' allowed')
0178 else
0179   validmethods = {'linear' 'pchip' 'spline'};
0180   ind = strmatch(lower(interpmethod),validmethods);
0181   if isempty(ind) || (length(ind) > 1)
0182     error('DISTANCE2CURVE:invalidinterpmethod', ...
0183       'Invalid method indicated. Only ''linear'',''pchip'',''spline'' allowed')
0184   end
0185   interpmethod = validmethods{ind};
0186 end
0187 
0188 % if the curve is a single point, stop here
0189 if n == 1
0190   % return the appropriate parameters
0191   xy = repmat(curvexy,m,1);
0192   t_a = zeros(m,1);
0193   
0194   % 2 norm distance, or sqrt of sum of squares of differences
0195   distance = sqrt(sum(bsxfun(@minus,curvexy,mapxy).^2,2));
0196   
0197   % we can drop out here
0198   return
0199 end
0200 
0201 % compute the chordal linear arclengths, and scale to [0,1].
0202 seglen = sqrt(sum(diff(curvexy,[],1).^2,2));
0203 t0 = [0;cumsum(seglen)/sum(seglen)];
0204 
0205 % We need to build some parametric splines.
0206 % compute the splines, storing the polynomials in one 3-d array
0207 ppsegs = cell(1,p);
0208 % the breaks for the splines will be t0, unless spline got fancy
0209 % on us here.
0210 breaks = t0;
0211 for i = 1:p
0212   switch interpmethod
0213     case 'linear'
0214       dt = diff(t0);
0215       ind = 1:(n-1);
0216       ppsegs{i} = [(curvexy(ind + 1,i) - curvexy(ind,i))./dt,curvexy(ind,i)];
0217     case 'pchip'
0218       spl = pchip(t0,curvexy(:,i));
0219       ppsegs{i} = spl.coefs;
0220     case 'spline'
0221       spl = spline(t0,curvexy(:,i));
0222       breaks = spl.breaks';
0223       nc = numel(spl.coefs);
0224       if nc < 4
0225         % just pretend it has cubic segments
0226         spl.coefs = [zeros(1,4-nc),spl{i}.coefs];
0227         spl.order = 4;
0228       end
0229       ppsegs{i} = spl.coefs;
0230   end
0231 end
0232 
0233 % how many breaks did we find in the spline? This is
0234 % only a thing to worry about for a spline based on few points,
0235 % when the function spline.m may choose to use only two breaks.
0236 nbr = numel(breaks);
0237 
0238 % for each point in mapxy, find the closest point to those
0239 % in curvexy. This part we can do in a vectorized form.
0240 pointdistances = ipdm(mapxy,curvexy,'metric',2, ...
0241   'result','structure','subset','nearestneighbor');
0242 
0243 % initialize the return variables, using the closest point
0244 % found in the set curvexy.
0245 xy = curvexy(pointdistances.columnindex,:);
0246 distance = pointdistances.distance;
0247 t = t0(pointdistances.columnindex);
0248 
0249 % we must now do at least some looping, still vectorized where possible.
0250 % the piecewise linear case is simpler though, so do it separately.
0251 if strcmp(interpmethod,'linear');
0252   % loop over the individual points, vectorizing in the number of
0253   % segments, when there are many segments, but not many points to map.
0254   if n >= (5*m)
0255     % many segments, so loop over the points in mapxy
0256     for i = 1:m
0257       % the i'th point in mapxy
0258       xyi = mapxy(i,:);
0259       
0260       % Compute the location (in t) of the minimal distance
0261       % point to xyi, for all lines.
0262       tnum = zeros(nbr - 1,1);
0263       tden = tnum;
0264       for j = 1:p
0265         ppj = ppsegs{j};
0266         tden = tden + ppj(:,1).^2;
0267         tnum = tnum + ppj(:,1).*(xyi(j) - ppj(:,2));
0268       end
0269       tmin = tnum./tden;
0270       
0271       % toss out any element of tmin that is less than or equal to
0272       % zero, or or is greater than dt for that segment.
0273       tmin((tmin <= 0) | (tmin >= diff(t0))) = NaN;
0274       
0275       % for any segments with a valid minimum distance inside the
0276       % segment itself, compute that distance.
0277       dmin = zeros(nbr - 1,1);
0278       for j = 1:p
0279         ppi = ppsegs{j};
0280         dmin = dmin + (ppi(:,1).*tmin + ppi(:,2) - xyi(j)).^2;
0281       end
0282       dmin = sqrt(dmin);
0283       
0284       % what is the minimum distance among these segments?
0285       [mindist,minind] = min(dmin);
0286       
0287       if ~isnan(mindist) && (distance(i) > mindist)
0288         % there is a best segment, better than the
0289         % closest point from curvexy.
0290         distance(i) = mindist;
0291         t(i) = tmin(minind) + t0(minind);
0292         
0293         for j = 1:p
0294           ppj = ppsegs{j};
0295           xy(i,j) = ppj(minind,1).*tmin(minind) + ppj(minind,2);
0296         end
0297         
0298       end
0299     end
0300   else
0301     for i = 1:(n-1)
0302       % the i'th segment of the curve
0303       t1 = t0(i);
0304       t2 = t0(i+1);
0305       
0306       % Compute the location (in t) of the minimal distance
0307       % point to mapxy, for all points.
0308       tnum = zeros(m,1);
0309       tden = 0;
0310       for j = 1:p
0311         ppj = ppsegs{j};
0312         tden = tden + ppj(i,1).^2;
0313         tnum = tnum + ppj(i,1).*(mapxy(:,j) - ppj(i,2));
0314       end
0315       tmin = tnum./tden;
0316       
0317       % We only care about those points for this segment where there
0318       % is a minimal distance to the segment that is internal to the
0319       % segment.
0320       k = find((tmin > 0) & (tmin < (t2-t1)));
0321       nk = numel(k);
0322       
0323       if nk > 0
0324         % for any points with a valid minimum distance inside the
0325         % segment itself, compute that distance.
0326         dmin = zeros(nk,1);
0327         xymin = zeros(nk,p);
0328         for j = 1:p
0329           ppj = ppsegs{j};
0330           xymin(:,j) = ppj(i,1).*tmin(k) + ppj(i,2);
0331           dmin = dmin + (xymin(:,j) - mapxy(k,j)).^2;
0332         end
0333         dmin = sqrt(dmin);
0334         
0335         L = dmin < distance(k);
0336         % this segment has a closer point
0337         % closest point from curvexy.
0338         if any(L)
0339           distance(k(L)) = dmin(L);
0340           t(k(L)) = tmin(k(L)) + t0(i);
0341           xy(k(L),:) = xymin(L,:);
0342         end
0343       end
0344     end
0345   end
0346   
0347   % for the linear case, t is identical to the fractional arc length
0348   % along the curve.
0349   t_a = t;
0350   
0351 else
0352   % cubic segments. here it is simplest to loop over the
0353   % distinct curve segments. We need not test the endpoints
0354   % of the segments, since the call to ipdm did that part.
0355   xytrans = zeros(1,p);
0356   polydiff = @(dp) dp(1:6).*[6 5 4 3 2 1];
0357   for j = 1:(n-1)
0358     % the j'th curve segment
0359     t1 = t0(j);
0360     t2 = t0(j+1);
0361 
0362     % for a polynomial in t that looks like
0363     % P(t) = a1*t^3 + a2*t^2 + a3*t + a4, in each dimension,
0364     % extract the polynomial pieces for the 6th degree polynomial
0365     % in t for the square of the Euclidean distance to the curve.
0366     % Thus, (P_x(t) - x0)^2 + (P_y(t) - y0)^2 + ...
0367     %
0368     % a1^2*t^6
0369     % 2*a1*a2*t^5
0370     % (2*a1*a3 + a2^2)*t^4
0371     % (2*a2*a3 - 2*a1*x0 + 2*a1*a4)*t^3
0372     % (a3^2 - 2*a2*x0 + 2*a2*a4)*t^2
0373     % (-2*a3*x0 + 2*a3*a4)*t
0374     % x0^2 - 2*a4*x0 + a4^2
0375     %
0376     % here, only the parts of this distance that are independent of
0377     % the point itself are computed. so the x0 terms are not built
0378     % yet. All of the terms with a4 in them will go away because
0379     % of the translation.
0380     distpoly0 = zeros(1,7);
0381     for i = 1:p
0382       ppi = ppsegs{i};
0383       % this will allow us to translate each poly to pass through
0384       % (0,0) (i.e., at t = 0)
0385       xytrans(i) = ppi(j,4);
0386       distpoly0(1:2) = distpoly0(1:2) + ppi(j,1).*[ppi(j,1),2*ppi(j,2)];
0387       distpoly0(3) = distpoly0(3) + 2.*ppi(j,1).*ppi(j,3) + ppi(j,2).^2;
0388       distpoly0(4) = distpoly0(4) + 2.*ppi(j,2).*ppi(j,3);
0389       distpoly0(5) = distpoly0(5) + ppi(j,3).^2;
0390     end
0391 
0392     for i = 1:m
0393       % the i'th point, translated by xytrans. The translation does
0394       % not change the distance to this segment, but it does make
0395       % the computations more robust to numerical problems.
0396       xyi = mapxy(i,:) - xytrans;
0397       
0398       % update the poly for this particular point
0399       % (-2*a1*x0)*t^3
0400       % (-2*a2*x0)*t^2
0401       % (-2*a3*x0)*t
0402       % x0^2
0403       distpoly = distpoly0;
0404       for k = 1:p
0405         ppk = ppsegs{k};
0406         distpoly(4:6) = distpoly(4:6) - 2.*ppk(j,1:3).*xyi(k);
0407         distpoly(7) = distpoly(7) + xyi(k).^2;
0408       end
0409       
0410       % find any minima of this polynomial in the interval (0,t2-t1).
0411       % we can ignore solutions that happen at the endpoints of the
0412       % interval, since those are already covered by ipdm.
0413       %
0414       % merely compute the zeros of the derivative polynomial
0415       diffpoly = polydiff(distpoly);
0416       tstationary = roots(diffpoly);
0417       % discard any with an imaginary part, those that are less
0418       % than 0, or greater than t2-t1.
0419       k = (imag(tstationary) ~= 0) | ...
0420         (real(tstationary) <= 0) | ...
0421         (real(tstationary) >= (t2 - t1));
0422       tstationary(k) = [];
0423       
0424       % for any solutions that remain, compute the distance.
0425       if ~isempty(tstationary)
0426         mindist = zeros(size(tstationary));
0427         xyij = zeros(numel(tstationary),p);
0428         for k = 1:p
0429           xyij(:,k) = polyval(ppsegs{k}(j,:),tstationary);
0430           mindist = mindist + (mapxy(i,k) - xyij(:,k)).^2;
0431         end
0432         mindist = sqrt(mindist);
0433         % just in case there is more than one stationary point
0434         [mindist,ind] = min(mindist);
0435         
0436         if mindist < distance(i)
0437           % we found a point on this segment that is better
0438           % than the endpoint values for that segment.
0439           distance(i) = mindist;
0440           xy(i,:) = xyij(ind,:);
0441           t(i) = tstationary(ind) + t0(j);
0442         end
0443       end % if ~isempty(tstationary)
0444     end % for i = 1:n
0445   end % for j = 1:(n-1)
0446   
0447   % do we need to return t_a? t_a is the same number that interparc
0448   % uses, whereas t as we have computed it so far is just the fractional
0449   % chordal arclength.
0450   %
0451   % Don't bother doing this last piece unless that argument is requested,
0452   % since it takes some additional work to do.
0453   if nargout >= 2
0454     % build new piecewise polynomials for each segment that
0455     % represent (dx/dt)^2 + (dy/dt)^2 + ...
0456     %
0457     % Since each poly must be cubic at this point, the result will be
0458     % a 4th degree piecewise polynomial.
0459     kernelcoefs = zeros(nbr-1,5);
0460     for i = 1:p
0461       ppi = ppsegs{i};
0462       kernelcoefs = kernelcoefs + [9*ppi(:,1).^2, ...
0463         12*ppi(:,1).*ppi(:,2), ...
0464         4*ppi(:,2).^2 + 6*ppi(:,1).*ppi(:,3), ...
0465         4*ppi(:,2).*ppi(:,3), ppi(:,3).^2];
0466     end
0467     
0468     % get the arc length for each segment. quadgk will suffice here
0469     % since we need to integrate the sqrt of each poly
0470     arclengths = zeros(nbr-1,1);
0471     for i = 1:(nbr - 1)
0472       lengthfun = @(t) sqrt(polyval(kernelcoefs(i,:),t));
0473       arclengths(i) = quadgk(lengthfun,0,t0(i+1) - t0(i));
0474     end
0475     
0476     % get the cumulative arclengths, then scale by the sum
0477     % this gives us fractional arc lengths.
0478     arclengths = cumsum(arclengths);
0479     totallength = arclengths(end);
0480     arclengths = [0;arclengths/totallength];
0481     
0482     % where does each point fall in terms of fractional cumulative
0483     % chordal arclength? (i.e., t0?)
0484     [tbin,tbin] = histc(t,t0);
0485     tbin(tbin < 1) = 1; % being careful at the bottom end
0486     tbin(tbin >= nbr) = nbr - 1; % if the point fell at the very top...
0487     
0488     % the total length below the segment in question
0489     t_a = arclengths(tbin);
0490     % now get the piece in the tbin segment
0491     for i = 1:m
0492       lengthfun = @(t) sqrt(polyval(kernelcoefs(tbin(i),:),t));
0493       t_a(i) = t_a(i) + quadgk(lengthfun,0,t(i) - t0(tbin(i)))/totallength;
0494     end
0495     
0496   end
0497   
0498 end % if strcmp(interpmethod,'linear');
0499 
0500 
0501 % ==========================================================
0502 function d = ipdm(data1,varargin)
0503 % ipdm: Inter-Point Distance Matrix
0504 % usage: d = ipdm(data1)
0505 % usage: d = ipdm(data1,data2)
0506 % usage: d = ipdm(data1,prop,value)
0507 % usage: d = ipdm(data1,data2,prop,value)
0508 %
0509 % Arguments: (input)
0510 %  data1 - array of data points, each point is one row. p dimensional
0511 %          data will be represented by matrix with p columns.
0512 %          If only data1 is provided, then the distance matrix
0513 %          is computed between all pairs of rows of data1.
0514 %
0515 %          If your data is one dimensional, it MUST form a column
0516 %          vector. A row vector of length n will be interpreted as
0517 %          an n-dimensional data set.
0518 %
0519 %  data2 - second array, supplied only if distances are to be computed
0520 %          between two sets of points.
0521 %
0522 %
0523 % Class support: data1 and data2 are assumed to be either
0524 % single or double precision. I have not tested this code to
0525 % verify its success on integer data of any class.
0526 %
0527 %
0528 % Additional parameters are expected to be property/value pairs.
0529 % Property/value pairs are pairs of arguments, the first of which
0530 % (properties) must always be a character string. These strings
0531 % may be shortened as long as the shortening is unambiguous.
0532 % Capitalization is ignored. Valid properties for ipdm are:
0533 %
0534 %  'Metric', 'Subset', 'Limit', 'Result'
0535 %
0536 %  'Metric' - numeric flag - defines the distance metric used
0537 %          metric = 2 --> (DEFAULT) Euclidean distance = 2-norm
0538 %                         The standard distance metric.
0539 %
0540 %          metric = 1 --> 1-norm = sum of absolute differences
0541 %                         Also sometimes known as the "city block
0542 %                         metric", since this is the sum of the
0543 %                         differences in each dimension.
0544 %
0545 %          metric = inf --> infinity-norm = maximum difference
0546 %                         over all dimensions. The name refers
0547 %                         to the limit of the p-norm, as p
0548 %                         approaches infinity.
0549 %
0550 %          metric = 0 --> minimum difference over all dimensions.
0551 %                         This is not really a useful norm in
0552 %                         practice.
0553 %
0554 %          Note: while other distance metrics exist, IMHO, these
0555 %          seemed to be the common ones.
0556 %
0557 %
0558 %  'Result' - A string variable that denotes the style of returned
0559 %          result. Valid result types are 'Array', 'Structure'.
0560 %          Capitalization is ignored, and the string may be
0561 %          shortened if you wish.
0562 %
0563 %          result = 'Array' --> (DEFAULT) A matrix of all
0564 %                         interpoint distances will be generated.
0565 %                         This array may be large. If this option
0566 %                         is specified along with a minimum or
0567 %                         maximum value, then those elements above
0568 %                         or below the limiting values will be
0569 %                         set as -inf or +inf, as appropriate.
0570 %
0571 %                         When any of 'LargestFew', 'SmallestFew',
0572 %                         or 'NearestNeighbor' are set, then the
0573 %                         resulting array will be a sparse matrix
0574 %                         if 'array' is specified as the result.
0575 %
0576 %          result = 'Structure' --> A list of all computed distances,
0577 %                         defined as a structure. This structure
0578 %                         will have fields named 'rowindex',
0579 %                         'columnindex', and 'distance'.
0580 %
0581 %                         This option will be useful when a subset
0582 %                         criterion for the distances has been
0583 %                         specified, since then the distance matrix
0584 %                         may be very sparsely populated. Distances
0585 %                         for pairs outside of the criterion will
0586 %                         not be returned.
0587 %
0588 %
0589 %  'Subset' - Character string, any of:
0590 %
0591 %          'All', 'Maximum', 'Minimum', 'LargestFew', 'SmallestFew',
0592 %          'NearestNeighbor', 'FarthestNeighbor', or empty
0593 %
0594 %          Like properties, capitalization is ignored here, and
0595 %          any unambiguous shortening of the word is acceptable.
0596 %
0597 %          DEFAULT = 'All'
0598 %
0599 %          Some interpoint distance matrices can be huge. Often
0600 %          these matrices are too large to be fully retained in
0601 %          memory, yet only the pair of points with the largest
0602 %          or smallest distance may be needed. When only some
0603 %          subset of the complete set of distances is of interest,
0604 %          these options allow you to specify which distances will
0605 %          be returned.
0606 %
0607 %          If 'result' is defined to be an array, then a sparse
0608 %          matrix will be returned for the 'LargestFew', 'SmallestFew',
0609 %          'NearestNeighbor', and 'FarthestNeighbor' subset classes.
0610 %          'Minimum' and 'Maximum' will yield full matrices by
0611 %          default. If a structure is specified, then only those
0612 %          elements which have been identified will be returned.
0613 %
0614 %          Where a subset is specified, its limiting value is
0615 %          specified by the 'Limit' property. Call that value k.
0616 %
0617 %
0618 %          'All' -->     (DEFAULT) Return all interpoint distances
0619 %
0620 %          'Minimum' --> Only look for those distances above
0621 %                        the cutoff k. All other distances will
0622 %                        be returned as -inf.
0623 %
0624 %          'Maximum' --> Only look for those distances below
0625 %                        the cutoff k. All other distances will
0626 %                        be returned as +inf.
0627 %
0628 %          'SmallestFew' --> Only return the subset of the k
0629 %                        smallest distances. Where only one data
0630 %                        set is provided, only the upper triangle
0631 %                        of the inter-point distance matrix will
0632 %                        be generated since that matrix is symmetric.
0633 %
0634 %          'LargestFew' --> Only return the subset of the k
0635 %                        largest distances. Where only one data
0636 %                        set is provided, only the upper triangle
0637 %                        of the inter-point distance matrix will
0638 %                        be generated since that matrix is symmetric.
0639 %
0640 %          'NearestNeighbor' --> Only return the single nearest
0641 %                        neighbor in data2 to each point in data1.
0642 %                        No limiting value is required for this
0643 %                        option. If multiple points have the same
0644 %                        nearest distance, then return the first
0645 %                        such point found. With only one input set,
0646 %                        a point will not be its own nearest
0647 %                        neighbor.
0648 %
0649 %                        Note that exact replicates in a single set
0650 %                        will cause problems, since a sparse matrix
0651 %                        is returned by default. Since they will have
0652 %                        a zero distance, they will not show up in
0653 %                        the sparse matrix. A structure return will
0654 %                        show those points as having a zero distance
0655 %                        though.
0656 %
0657 %          'FarthestNeighbor' --> Only return the single farthest
0658 %                        neighbor to each point. No limiting value
0659 %                        is required for this option. If multiple
0660 %                        points have the same farthest distance,
0661 %                        then return the first such point found.
0662 %
0663 %
0664 %  'Limit' - scalar numeric value or []. Used only when some
0665 %           Subset is specified.
0666 %
0667 %          DEFAULT = []
0668 %
0669 %
0670 %  'ChunkSize' - allows a user with lower RAM limits
0671 %          to force the code to only grab smaller chunks of RAM
0672 %          at a time (where possible). This parameter is specified
0673 %          in bytes of RAM. The default is 32 megabytes, or 2^22
0674 %          elements in any piece of the distance matrix. Only some
0675 %          options will break the problem into chunks, thus as long
0676 %          as a full matrix is expected to be returned, there seems
0677 %          no reason to break the problem up into pieces.
0678 %
0679 %          DEFAULT = 2^25
0680 %
0681 %
0682 % Arguments: (output)
0683 %  d     - array of interpoint distances, or a struct wth the
0684 %          fields {'rowindex', 'columnindex', 'distance'}.
0685 %
0686 %          d(i,j) represents the distance between point i
0687 %          (from data1) and point j (from data2).
0688 %
0689 %          If only one (n1 x p) array is supplied, then d will
0690 %          be an array of size == [n1,n1].
0691 %
0692 %          If two arrays (of sizes n1 x p and n2 x p) then d
0693 %          will be an array of size == [n1,n2].
0694 %
0695 %
0696 % Efficiency considerations:
0697 %  Where possible, this code will use bsxfun to compute its
0698 %  distances.
0699 %
0700 %
0701 % Example:
0702 %  Compute the interpoint distances between all pairs of points
0703 %  in a list of 5 points, in 2 dimensions and using Euclidean
0704 %  distance as the distance metric.
0705 %
0706 %  A = randn(5,2);
0707 %  d = ipdm(A,'metric',2)
0708 %  d =
0709 %            0       2.3295       3.2263       2.0263       2.8244
0710 %       2.3295            0       1.1485      0.31798       1.0086
0711 %       3.2263       1.1485            0       1.4318       1.8479
0712 %       2.0263      0.31798       1.4318            0       1.0716
0713 %       2.8244       1.0086       1.8479       1.0716            0
0714 %
0715 % (see the demo file for many other examples)
0716 %
0717 % See also: pdist
0718 %
0719 % Author: John D'Errico
0720 % e-mail: woodchips@rochester.rr.com
0721 % Release: 1.0
0722 % Release date: 2/26/08
0723 
0724 % Default property values
0725 params.Metric = 2;
0726 params.Result = 'array';
0727 params.Subset = 'all';
0728 params.Limit = [];
0729 params.ChunkSize = 2^25;
0730 
0731 % untangle the arguments
0732 if nargin<1
0733   % if called with no arguments, then the user probably
0734   % needs help. Give it to them.
0735   help ipdm
0736   return
0737 end
0738 
0739 % were two sets of data provided?
0740 pvpairs = {};
0741 if nargin==1
0742   % only 1 set of data provided
0743   dataflag = 1;
0744   data2 = [];
0745 else
0746   if ischar(varargin{1})
0747     dataflag = 1;
0748     data2 = [];
0749     pvpairs = varargin;
0750   else
0751     dataflag = 2;
0752     data2 = varargin{1};
0753     if nargin>2
0754       pvpairs = varargin(2:end);
0755     end
0756   end
0757 end
0758 
0759 % get data sizes for later
0760 [n1,dim] = size(data1);
0761 if dataflag == 2
0762   n2 = size(data2,1);
0763 end
0764 
0765 % Test the class of the input variables
0766 if ~(isa(data1,'double') || isa(data1,'single')) || ...
0767     ((dataflag == 2) && ~(isa(data2,'double') || isa(data2,'single')))
0768   error('data points must be either single or double precision variables.')
0769 end
0770 
0771 % do we need to process any property/value pairs?
0772 if nargin>2
0773   params = parse_pv_pairs(params,pvpairs);
0774   
0775   % check for problems in the properties
0776   
0777   % was a legal Subset provided?
0778   if ~isempty(params.Subset) && ~ischar(params.Subset)
0779     error('If provided, ''Subset'' must be character')
0780   elseif isempty(params.Subset)
0781     params.Subset = 'all';
0782   end
0783   valid = {'all','maximum','minimum','largestfew','smallestfew', ...
0784     'nearestneighbor','farthestneighbor'};
0785   ind = find(strncmpi(params.Subset,valid,length(params.Subset)));
0786   if (length(ind)==1)
0787     params.Subset = valid{ind};
0788   else
0789     error(['Invalid Subset: ',params.Subset])
0790   end
0791   
0792   % was a limit provided?
0793   if ~ismember(params.Subset,{'all','nearestneighbor','farthestneighbor'}) && ...
0794       isempty(params.Limit)
0795     error('No limit provided, but a Subset that requires a limit value was specified')
0796   end
0797   % check the limit values for validity
0798   if length(params.Limit)>1
0799     error('Limit must be scalar or empty')
0800   end
0801   
0802   switch params.Subset
0803     case {'largestfew', 'smallestfew'}
0804       % must be at least 1, and an integer
0805       if (params.Limit<1) || (round(params.Limit)~=params.Limit)
0806         error('Limit must be a positive integer for LargestFew or NearestFew')
0807       end
0808   end
0809   
0810   % was a legal Result provided?
0811   if isempty(params.Result)
0812     params.result = 'Array';
0813   elseif ~ischar(params.Result)
0814     error('If provided, ''Result'' must be character or empty')
0815   end
0816   valid = {'array','structure'};
0817   ind = find(strncmpi(params.Result,valid,length(params.Result)));
0818   if (length(ind)==1)
0819     params.Result = valid{ind};
0820   else
0821     error(['Invalid Result: ',params.Subset])
0822   end
0823 
0824   % check for the metric
0825   if isempty(params.Metric)
0826     params.Metric = 2;
0827   elseif (length(params.Metric)~=1) || ~ismember(params.Metric,[0 1 2 inf])
0828     error('If supplied, ''Metric'' must be a scalar, and one of [0 1 2 inf]')
0829   end
0830 end % if nargin>2
0831   
0832 % If Metric was given as 2, but the dimension is only 1, then it will
0833 % be slightly faster (and equivalent) to use the 1-norm Metric.
0834 if (dim == 1) && (params.Metric == 2)
0835   params.Metric = 1;
0836 end
0837 
0838 % Can we use bsxfun to compute the interpoint distances?
0839 % Older Matlab releases will not have bsxfun, but if it is
0840 % around, it will ne both faster and less of a memory hog.
0841 params.usebsxfun = (5==exist('bsxfun','builtin'));
0842 
0843 % check for dimension mismatch if 2 sets
0844 if (dataflag==2) && (size(data2,2)~=dim)
0845   error('If 2 point sets provided, then both must have the same number of columns')
0846 end
0847 
0848 % Total number of distances to compute, in case I must do it in batches
0849 if dataflag==1
0850   n2 = n1;
0851 end
0852 ntotal = n1*n2;
0853 
0854 % FINALLY!!! Compute inter-point distances
0855 switch params.Subset
0856   case 'all'
0857     % The complete set of interpoint distances. There is no need
0858     % to break this into chunks, since we must return all distances.
0859     % If that is too much to compute in memory, then it will fail
0860     % anyway when we try to store the result. bsxfun will at least
0861     % do the computation efficiently.
0862     
0863     % One set or two?
0864     if dataflag == 1
0865       d = distcomp(data1,data1,params);
0866     else
0867         d = distcomp(data1,data2,params);
0868     end
0869     
0870     % Must we return it as a struct?
0871     if params.Result(1) == 's'
0872       [rind,cind] = ndgrid(1:size(d,1),1:size(d,2));
0873       ds.rowindex = rind(:);
0874       ds.columnindex = cind(:);
0875       ds.distance = d(:);
0876       d = ds;
0877     end
0878     
0879   case {'minimum' 'maximum'}
0880     % There is no reason to break this into pieces if the result
0881     % sill be filled in the end with +/- inf. Only break it up
0882     % if the final result is a struct.
0883     if ((ntotal*8)<=params.ChunkSize) || (params.Result(1) == 'a')
0884       % its small enough to do it all at once
0885       
0886       % One set or two?
0887       if dataflag == 1
0888         d = distcomp(data1,data1,params);
0889       else
0890           d = distcomp(data1,data2,params);
0891       end
0892       
0893       % Must we return it as a struct?
0894       if params.Result(1) == 'a'
0895         % its an array, fill the unwanted distances with +/- inf
0896         if params.Subset(2) == 'i'
0897           % minimum
0898           d(d<=params.Limit) = -inf;
0899         else
0900           % maximum
0901           d(d>=params.Limit) = +inf;
0902         end
0903       else
0904         % a struct will be returned
0905         if params.Subset(2) == 'i'
0906           % minimum
0907           [dist.rowindex,dist.columnindex] = find(d>=params.Limit);
0908         else
0909           % maximum
0910           [dist.rowindex,dist.columnindex] = find(d<=params.Limit);
0911         end
0912         dist.distance = d(dist.rowindex + n1*(dist.columnindex-1));
0913         d = dist;
0914       end
0915       
0916     else
0917       % we need to break this into chunks. This branch
0918       % will always return a struct.
0919       
0920       % this is the number of rows of data1 that we will
0921       % process at a time.
0922       bs = floor(params.ChunkSize/(8*n2));
0923       bs = min(n1,max(1,bs));
0924       
0925       % Accumulate the result into a cell array. Do it this
0926       % way because we don't know in advance how many elements
0927       % that we will find satisfying the minimum or maximum
0928       % limit specified.
0929       accum = cell(0,1);
0930       
0931       % now loop over the chunks
0932       batch = 1:bs;
0933       while ~isempty(batch)
0934         
0935         % One set or two?
0936         if dataflag == 1
0937           dist = distcomp(data1(batch,:),data1,params);
0938         else
0939           dist = distcomp(data1(batch,:),data2,params);
0940         end
0941         
0942         % big or small as requested
0943         if ('i'==params.Subset(2))
0944           % minimum value specified
0945           [I,J,V] = find(dist>=params.Limit);
0946         else
0947           % maximum limit
0948           [I,J] = find(dist<=params.Limit);
0949           I = I(:);
0950           J = J(:);
0951           V = dist(I + (J-1)*length(batch));
0952           I = I + (batch(1)-1);
0953         end
0954 
0955         % and stuff them into the cell structure
0956         if ~isempty(V)
0957           accum{end+1,1} = [I,J,V(:)]; %#ok
0958         end
0959 
0960         % increment the batch
0961         batch = batch + bs;
0962         if batch(end)>n1
0963           batch(batch>n1) = [];
0964         end
0965 
0966       end
0967 
0968       % convert the cells into one flat array
0969       accum = cell2mat(accum);
0970 
0971       if isempty(accum)
0972         d.rowindex = [];
0973         d.columnindex = [];
0974         d.distance = [];
0975       else
0976         % we found something
0977 
0978         % sort on the second column, to put them in a reasonable order
0979         accum = sortrows(accum,[2 1]);
0980 
0981         d.rowindex = accum(:,1);
0982         d.columnindex = accum(:,2);
0983         d.distance = accum(:,3);
0984       end
0985 
0986     end
0987 
0988   case {'smallestfew' 'largestfew'}
0989     % find the k smallest/largest distances. k is
0990     % given by params.Limit
0991 
0992     % if only 1 set, params.Limit must be less than n*(n-1)/2
0993     if dataflag == 1
0994       params.Limit = min(params.Limit,n1*(n1-1)/2);
0995     end
0996 
0997     % is this a large problem?
0998     if ((ntotal*8) <= params.ChunkSize)
0999       % small potatoes
1000 
1001       % One set or two?
1002       if dataflag == 1
1003         dist = distcomp(data1,data1,params);
1004         % if only one data set, set the diagonal and
1005         % below that to +/- inf so we don't find it.
1006         temp = find(tril(ones(n1,n1),0));
1007         if params.Subset(1) == 's'
1008           dist(temp) = inf;
1009         else
1010           dist(temp) = -inf;
1011         end
1012       else
1013         dist = distcomp(data1,data2,params);
1014       end
1015 
1016       % sort the distances to find those we need
1017       if ('s'==params.Subset(1))
1018         % smallestfew
1019         [val,tags] = sort(dist(:),'ascend');
1020       else
1021         % largestfew
1022         [val,tags] = sort(dist(:),'descend');
1023       end
1024       val = val(1:params.Limit);
1025       tags = tags(1:params.Limit);
1026 
1027       % recover the row and column index from the linear
1028       % index returned by sort in tags.
1029       [d.rowindex,d.columnindex] = ind2sub([n1,size(dist,2)],tags);
1030 
1031       % create the matrix as a sparse one or a struct?
1032       if params.Result(1)=='a'
1033         % its an array, so make the array sparse.
1034         d = sparse(d.rowindex,d.columnindex,val,n1,size(dist,2));
1035       else
1036         % a structure
1037         d.distance = val;
1038       end
1039 
1040     else
1041       % chunks
1042 
1043       % this is the number of rows of data1 that we will
1044       % process at a time.
1045       bs = floor(params.ChunkSize/(8*n2));
1046       bs = min(n1,max(1,bs));
1047 
1048       % We need to find the extreme cases. There are two possible
1049       % algorithms, depending on how many total elements we will
1050       % search for.
1051       % 1. Only a very few total elements.
1052       % 2. A relatively large number of total elements, forming
1053       %    a significant fraction of the total set.
1054       %
1055       % Case #1 would suggest to retain params.Limit numberr of
1056       % elements from each batch, then at the end, sort them all
1057       % to find the best few. Case #2 will result in too many
1058       % elements to retain, so we must distinguish between these
1059       % alternatives.
1060       if (8*params.Limit*n1/bs) <= params.ChunkSize
1061         % params.Limit is small enough to fall into case #1.
1062 
1063         % Accumulate the result into a cell array. Do it this
1064         % way because we don't know in advance how many elements
1065         % that we will find satisfying the minimum or maximum
1066         % limit specified.
1067         accum = cell(0,1);
1068 
1069         % now loop over the chunks
1070         batch = (1:bs)';
1071         while ~isempty(batch)
1072           % One set or two?
1073           if dataflag == 1
1074             dist = distcomp(data1(batch,:),data1,params);
1075             k = find(tril(ones(length(batch),n2),batch(1)-1));
1076             if ('s'==params.Subset(1))
1077               dist(k) = inf;
1078             else
1079               dist(k) = -inf;
1080             end
1081           else
1082             dist = distcomp(data1(batch,:),data2,params);
1083           end
1084 
1085           % big or small as requested, keeping only the best
1086           % params.Limit number of elements
1087           if ('s'==params.Subset(1))
1088             % minimum value specified
1089             [tags,tags] = sort(dist(:),1,'ascend'); %#ok
1090             tags = tags(1:bs);
1091             [I,J] = ndgrid(batch,1:n2);
1092             ijv = [I(tags),J(tags),dist(tags)];
1093           else
1094             % maximum limit
1095             [tags,tags] = sort(dist(:),1,'descend'); %#ok
1096             tags = tags(1:bs);
1097             [I,J] = ndgrid(batch,1:n2);
1098             ijv = [I(tags),J(tags),dist(tags)];
1099           end
1100           % and stuff them into the cell structure
1101           accum{end+1,1} = ijv; %#ok
1102 
1103           % increment the batch
1104           batch = batch + bs;
1105           if batch(end)>n1
1106             batch(batch>n1) = [];
1107           end
1108         end
1109 
1110         % convert the cells into one flat array
1111         accum = cell2mat(accum);
1112 
1113         % keep only the params.Limit best of those singled out
1114         accum = sortrows(accum,3);
1115         if ('s'==params.Subset(1))
1116           % minimum value specified
1117           accum = accum(1:params.Limit,:);
1118         else
1119           % minimum value specified
1120           accum = accum(end + 1 - (1:params.Limit),:);
1121         end
1122         d.rowindex = accum(:,1);
1123         d.columnindex = accum(:,2);
1124         d.distance = accum(:,3);
1125 
1126         % create the matrix as a sparse one or a struct?
1127         if params.Result(1)=='a'
1128           % its an array, so make the array sparse.
1129           d = sparse(d.rowindex,d.columnindex,d.distance,n1,size(dist,2));
1130         end
1131 
1132       else
1133         % params.Limit forces us into the domain of case #2.
1134         % Here we cannot retain params.Limit elements from each chunk.
1135         % so we will grab each chunk and append it to the best elements
1136         % found so far, then filter out the best after each chunk is
1137         % done. This may be slower than we want, but its the only way.
1138         ijv = zeros(0,3);
1139 
1140         % loop over the chunks
1141         batch = (1:bs)';
1142         while ~isempty(batch)
1143           % One set or two?
1144           if dataflag == 1
1145             dist = distcomp(data1(batch,:),data1,params);
1146             k = find(tril(ones(length(batch),n2),batch(1)-1));
1147             if ('s'==params.Subset(1))
1148               dist(k) = inf;
1149             else
1150               dist(k) = -inf;
1151             end
1152           else
1153             dist = distcomp(data1(batch,:),data2,params);
1154           end
1155 
1156           [I,J] = ndgrid(batch,1:n2);
1157           ijv = [ijv;[I(:),J(:),dist(:)]]; %#ok
1158 
1159           % big or small as requested, keeping only the best
1160           % params.Limit number of elements
1161           if size(ijv,1) > params.Limit
1162             if ('s'==params.Subset(1))
1163               % minimum value specified
1164               [tags,tags] = sort(ijv(:,3),1,'ascend'); %#ok
1165             else
1166               [tags,tags] = sort(ijv(:,3),1,'ascend'); %#ok
1167             end
1168             ijv = ijv(tags(1:params.Limit),:);
1169           end
1170 
1171           % increment the batch
1172           batch = batch + bs;
1173           if batch(end)>n1
1174             batch(batch>n1) = [];
1175           end
1176         end
1177 
1178         % They are fully trimmed down. stuff a structure
1179         d.rowindex = ijv(:,1);
1180         d.columnindex = ijv(:,2);
1181         d.distance = ijv(:,3);
1182 
1183         % create the matrix as a sparse one or a struct?
1184         if params.Result(1)=='a'
1185           % its an array, so make the array sparse.
1186           d = sparse(d.rowindex,d.columnindex,d.distance,n1,size(dist,2));
1187         end
1188 
1189       end
1190 
1191     end
1192     
1193   case {'nearestneighbor' 'farthestneighbor'}
1194     % find the closest/farthest neighbor for every point
1195 
1196     % is this a large problem? Or a 1-d problem?
1197     if dim == 1
1198       % its a 1-d nearest/farthest neighbor problem. we can
1199       % special case these easily enough, and all the distance
1200       % metric options are the same in 1-d.
1201 
1202       % first split it into the farthest versus nearest cases.
1203       if params.Subset(1) == 'f'
1204         % farthest away
1205 
1206         % One set or two?
1207         if dataflag == 1
1208           [d2min,minind] = min(data1);
1209           [d2max,maxind] = max(data1);
1210         else
1211           [d2min,minind] = min(data2);
1212           [d2max,maxind] = max(data2);
1213         end
1214 
1215         d.rowindex = (1:n1)';
1216         d.columnindex = repmat(maxind,n1,1);
1217         d.distance = repmat(d2max,n1,1);
1218 
1219         % which endpoint was further away?
1220         k = abs((data1 - d2min)) >= abs((data1 - d2max));
1221         if any(k)
1222           d.columnindex(k) = minind;
1223           d.distance(k) = d2min;
1224         end
1225 
1226       else
1227         % nearest. this is mainly a sort and some fussing around.
1228         d.rowindex = (1:n1)';
1229         d.columnindex = ones(n1,1);
1230         d.distance = zeros(n1,1);
1231 
1232         % One set or two?
1233         if dataflag == 1
1234           % if only one data point, then we are done
1235           if n1 == 2
1236             % if exactly two data points, its trivial
1237             d.columnindex = [2 1];
1238             d.distance = repmat(abs(diff(data1)),2,1);
1239           elseif n1>2
1240             % at least three points. do a sort.
1241             [sorted_data,tags] = sort(data1);
1242 
1243             % handle the first and last points separately
1244             d.columnindex(tags(1)) = tags(2);
1245             d.distance(tags(1)) = sorted_data(2) - sorted_data(1);
1246             d.columnindex(tags(end)) = tags(end-1);
1247             d.distance(tags(end)) = sorted_data(end) - sorted_data(end-1);
1248 
1249             ind = (2:(n1-1))';
1250 
1251             d1 = sorted_data(ind) - sorted_data(ind-1);
1252             d2 = sorted_data(ind+1) - sorted_data(ind);
1253 
1254             k = d1 < d2;
1255             d.distance(tags(ind(k))) = d1(k);
1256             d.columnindex(tags(ind(k))) = tags(ind(k)-1);
1257             k = ~k;
1258             d.distance(tags(ind(k))) = d2(k);
1259             d.columnindex(tags(ind(k))) = tags(ind(k)+1);
1260           end % if n1 == 2
1261         else
1262           % Two sets of data. still really a sort and some fuss.
1263           if n2 == 1
1264             % there is only one point in data2
1265             d.distance = abs(data1 - data2);
1266             % d.columnindex is already set correctly
1267           else
1268             % At least two points in data2
1269             % We need to sort all the data points together, but also
1270             % know which points from each set went where. ind12 and
1271             % bool12 will help keep track.
1272             ind12 = [1:n1,1:n2]';
1273             bool12 = [zeros(n1,1);ones(n2,1)];
1274             [sorted_data,tags] = sort([data1;data2]);
1275 
1276             ind12 = ind12(tags);
1277             bool12 = bool12(tags);
1278 
1279             % where did each point end up after the sort?
1280             loc1 = find(~bool12);
1281             loc2 = find(bool12);
1282 
1283             % for each point in data1, what is the (sorted) data2
1284             % element which appears most nearly to the left of it?
1285             cs = cumsum(bool12);
1286             leftelement = cs(loc1);
1287 
1288             % any points which fell below the minimum element in data2
1289             % will have a zero for the index of the element on their
1290             % left. fix this.
1291             leftelement = max(1,leftelement);
1292 
1293             % likewise, any point greater than the max in data2 will
1294             % have an n2 in left element. this too will be a problem
1295             % later, so fix it.
1296             leftelement = min(n2-1,leftelement);
1297 
1298             % distance to the left hand element
1299             dleft = abs(sorted_data(loc1) - sorted_data(loc2(leftelement)));
1300             dright = abs(sorted_data(loc1) - sorted_data(loc2(leftelement+1)));
1301 
1302             % find the points which are closer to the left element in data2
1303             k = (dleft < dright);
1304             d.distance(ind12(loc1(k))) = dleft(k);
1305             d.columnindex(ind12(loc1(k))) = ind12(loc2(leftelement(k)));
1306             k = ~k;
1307             d.distance(ind12(loc1(k))) = dright(k);
1308             d.columnindex(ind12(loc1(k))) = ind12(loc2(leftelement(k)+1));
1309 
1310           end % if n2 == 1
1311         end % if dataflag == 1
1312       end % if params.Subset(1) == 'f'
1313 
1314       % create the matrix as a sparse one or a struct?
1315       if params.Result(1)=='a'
1316         % its an array, so make the array sparse.
1317         d = sparse(d.rowindex,d.columnindex,d.distance,n1,n2);
1318       end
1319 
1320     elseif (ntotal>1000) && (((params.Metric == 0) && (params.Subset(1) == 'n')) || ...
1321         ((params.Metric == inf) && (params.Subset(1) == 'f')))
1322       % nearest/farthest neighbour in n>1 dimensions, but for an
1323       % infinity norm metric. Reduce this to a sequence of
1324       % 1-d problems, each of which will be faster in general.
1325       % do this only if the problem is moderately large, since
1326       % we must overcome the extra overhead of the recursive
1327       % calls to ipdm.
1328 
1329       % do the first dimension
1330       if dataflag == 1
1331         d = ipdm(data1(:,1),data1(:,1),'subset',params.Subset,'metric',params.Metric,'result','struct');
1332       else
1333         d = ipdm(data1(:,1),data2(:,1),'subset',params.Subset,'metric',params.Metric,'result','struct');
1334       end
1335 
1336       % its slightly different for nearest versus farthest here
1337       % now, loop over dimensions
1338       for i = 2:dim
1339         if dataflag == 1
1340           di = ipdm(data1(:,i),data1(:,i),'subset',params.Subset,'metric',params.Metric,'result','struct');
1341         else
1342           di = ipdm(data1(:,i),data2(:,i),'subset',params.Subset,'metric',params.Metric,'result','struct');
1343         end
1344 
1345         % did any of the distances change?
1346         if params.Metric == 0
1347           % the 0 norm, with nearest neighbour, so take the
1348           % smallest distance in any dimension.
1349           k = d.distance > di.distance;
1350         else
1351           % inf norm. so take the largest distance across dimensions
1352           k = d.distance < di.distance;
1353         end
1354 
1355         if any(k)
1356           d.distance(k) = di.distance(k);
1357           d.columnindex(k) = di.columnindex(k);
1358         end
1359       end
1360 
1361       % create the matrix as a sparse one or a struct?
1362       if params.Result(1)=='a'
1363         % its an array, so make the array sparse.
1364         d = sparse(d.rowindex,d.columnindex,d.distance,n1,n2);
1365       end
1366 
1367     elseif ((ntotal*8) <= params.ChunkSize)
1368       % None of the other special cases apply, so do it using brute
1369       % force for the small potatoes problem.
1370 
1371       % One set or two?
1372       if dataflag == 1
1373         dist = distcomp(data1,data1,params);
1374       else
1375         dist = distcomp(data1,data2,params);
1376       end
1377 
1378       % if only one data set and if a nearest neighbor
1379       % problem, set the diagonal to +inf so we don't find it.
1380       if (dataflag==1) && (n1>1) && ('n'==params.Subset(1))
1381         diagind = (1:n1) + (0:n1:(n1^2-1));
1382         dist(diagind) = +inf;
1383       end
1384 
1385       if ('n'==params.Subset(1))
1386         % nearest
1387         [val,j] = min(dist,[],2);
1388       else
1389         % farthest
1390         [val,j] = max(dist,[],2);
1391       end
1392 
1393       % create the matrix as a sparse one or a struct?
1394       if params.Result(1)=='a'
1395         % its an array, so make the array sparse.
1396         d = sparse((1:n1)',j,val,n1,size(dist,2));
1397       else
1398         % a structure
1399         d.rowindex = (1:n1)';
1400         d.columnindex = j;
1401         d.distance = val;
1402       end
1403 
1404     else
1405 
1406       % break it into chunks
1407       bs = floor(params.ChunkSize/(8*n2));
1408       bs = min(n1,max(1,bs));
1409 
1410       % pre-allocate the result
1411       d.rowindex = (1:n1)';
1412       d.columnindex = zeros(n1,1);
1413       d.distance = zeros(n1,1);
1414 
1415       % now loop over the chunks
1416       batch = 1:bs;
1417       while ~isempty(batch)
1418 
1419         % One set or two?
1420         if dataflag == 1
1421           dist = distcomp(data1(batch,:),data1,params);
1422         else
1423           dist = distcomp(data1(batch,:),data2,params);
1424         end
1425 
1426         % if only one data set and if a nearest neighbor
1427         % problem, set the diagonal to +inf so we don't find it.
1428         if (dataflag==1) && (n1>1) && ('n'==params.Subset(1))
1429           diagind = 1:length(batch);
1430           diagind = diagind + (diagind-2+batch(1))*length(batch);
1431           dist(diagind) = +inf;
1432         end
1433 
1434         % big or small as requested
1435         if ('n'==params.Subset(1))
1436           % nearest
1437           [val,j] = min(dist,[],2);
1438         else
1439           % farthest
1440           [val,j] = max(dist,[],2);
1441         end
1442 
1443         % and stuff them into the result structure
1444         d.columnindex(batch) = j;
1445         d.distance(batch) = val;
1446 
1447         % increment the batch
1448         batch = batch + bs;
1449         if batch(end)>n1
1450           batch(batch>n1) = [];
1451         end
1452 
1453       end
1454 
1455       % did we need to return a struct or an array?
1456       if params.Result(1) == 'a'
1457         % an array. make it a sparse one
1458         d = sparse(d.rowindex,d.columnindex,d.distance,n1,n2);
1459       end
1460 
1461     end % if dim == 1
1462 
1463 end  % switch params.Subset
1464 
1465 % End of mainline
1466 
1467 % ======================================================
1468 % begin subfunctions
1469 % ======================================================
1470 function d = distcomp(set1,set2,params)
1471 % Subfunction to compute all distances between two sets of points
1472 dim = size(set1,2);
1473 % can we take advantage of bsxfun?
1474 % Note: in theory, there is no need to loop over the dimensions. We
1475 % could Just let bsxfun do ALL the work, then wrap a sum around the
1476 % outside. In practice, this tends to create large intermediate
1477 % arrays, especially in higher numbers of dimensions. Its also when
1478 % we might gain here by use of a vectorized code. This will only be
1479 % a serious gain when the number of points is relatively small and
1480 % the dimension is large.
1481 if params.usebsxfun
1482   % its a recent enough version of matlab that we can
1483   % use bsxfun at all.
1484   n1 = size(set1,1);
1485   n2 = size(set2,1);
1486   if (dim>1) && ((n1*n2*dim)<=params.ChunkSize)
1487     % its a small enough problem that we might gain by full
1488     % use of bsxfun
1489     switch params.Metric
1490       case 2
1491         d = sum(bsxfun(@minus,reshape(set1,[n1,1,dim]),reshape(set2,[1,n2,dim])).^2,3);
1492       case 1
1493         d = sum(abs(bsxfun(@minus,reshape(set1,[n1,1,dim]),reshape(set2,[1,n2,dim]))),3);
1494       case inf
1495         d = max(abs(bsxfun(@minus,reshape(set1,[n1,1,dim]),reshape(set2,[1,n2,dim]))),[],3);
1496       case 0
1497         d = min(abs(bsxfun(@minus,reshape(set1,[n1,1,dim]),reshape(set2,[1,n2,dim]))),[],3);
1498     end
1499   else
1500     % too big, so that the ChunkSize will have been exceeded, or just 1-d
1501     if params.Metric == 2
1502       d = bsxfun(@minus,set1(:,1),set2(:,1)').^2;
1503     else
1504       d = abs(bsxfun(@minus,set1(:,1),set2(:,1)'));
1505     end
1506     for i=2:dim
1507       switch params.Metric
1508         case 2
1509           d = d + bsxfun(@minus,set1(:,i),set2(:,i)').^2;
1510         case 1
1511           d = d + abs(bsxfun(@minus,set1(:,i),set2(:,i)'));
1512         case inf
1513           d = max(d,abs(bsxfun(@minus,set1(:,i),set2(:,i)')));
1514         case 0
1515           d = min(d,abs(bsxfun(@minus,set1(:,i),set2(:,i)')));
1516       end
1517     end
1518   end
1519 else
1520   % Cannot use bsxfun. Sigh. Do things the hard (and slower) way.
1521   n1 = size(set1,1);
1522   n2 = size(set2,1);
1523   if params.Metric == 2
1524     % Note: While some people might use a different Euclidean
1525     % norm computation based on expanding the square of the
1526     % difference of two numbers, that computation is inherantly
1527     % inaccurate when implemented in floating point arithmetic.
1528     % While it might be faster, I won't use it here. Sorry.
1529     d = (repmat(set1(:,1),1,n2) - repmat(set2(:,1)',n1,1)).^2;
1530   else
1531     d = abs(repmat(set1(:,1),1,n2) - repmat(set2(:,1)',n1,1));
1532   end
1533   for i=2:dim
1534     switch params.Metric
1535       case 2
1536         d = d + (repmat(set1(:,i),1,n2) - repmat(set2(:,i)',n1,1)).^2;
1537       case 1
1538         d = d + abs(repmat(set1(:,i),1,n2) - repmat(set2(:,i)',n1,1));
1539       case inf
1540         d = max(d,abs(repmat(set1(:,i),1,n2) - repmat(set2(:,i)',n1,1)));
1541       case 0
1542         d = min(d,abs(repmat(set1(:,i),1,n2) - repmat(set2(:,i)',n1,1)));
1543     end
1544   end
1545 end
1546 % if 2 norm, then we must sqrt at the end
1547 if params.Metric==2
1548   d = sqrt(d);
1549 end
1550 
1551 % ==============================================================
1552 %    end main ipdm
1553 %    begin included function - parse_pv_pairs
1554 % ==============================================================
1555 function params=parse_pv_pairs(params,pv_pairs)
1556 % parse_pv_pairs: parses sets of property value pairs, allows defaults
1557 % usage: params=parse_pv_pairs(default_params,pv_pairs)
1558 %
1559 % arguments: (input)
1560 %  default_params - structure, with one field for every potential
1561 %             property/value pair. Each field will contain the default
1562 %             value for that property. If no default is supplied for a
1563 %             given property, then that field must be empty.
1564 %
1565 %  pv_array - cell array of property/value pairs.
1566 %             Case is ignored when comparing properties to the list
1567 %             of field names. Also, any unambiguous shortening of a
1568 %             field/property name is allowed.
1569 %
1570 % arguments: (output)
1571 %  params   - parameter struct that reflects any updated property/value
1572 %             pairs in the pv_array.
1573 %
1574 % Example usage:
1575 % First, set default values for the parameters. Assume we
1576 % have four parameters that we wish to use optionally in
1577 % the function examplefun.
1578 %
1579 %  - 'viscosity', which will have a default value of 1
1580 %  - 'volume', which will default to 1
1581 %  - 'pie' - which will have default value 3.141592653589793
1582 %  - 'description' - a text field, left empty by default
1583 %
1584 % The first argument to examplefun is one which will always be
1585 % supplied.
1586 %
1587 %   function examplefun(dummyarg1,varargin)
1588 %   params.Viscosity = 1;
1589 %   params.Volume = 1;
1590 %   params.Pie = 3.141592653589793
1591 %
1592 %   params.Description = '';
1593 %   params=parse_pv_pairs(params,varargin);
1594 %   params
1595 %
1596 % Use examplefun, overriding the defaults for 'pie', 'viscosity'
1597 % and 'description'. The 'volume' parameter is left at its default.
1598 %
1599 %   examplefun(rand(10),'vis',10,'pie',3,'Description','Hello world')
1600 %
1601 % params =
1602 %     Viscosity: 10
1603 %        Volume: 1
1604 %           Pie: 3
1605 %   Description: 'Hello world'
1606 %
1607 % Note that capitalization was ignored, and the property 'viscosity'
1608 % was truncated as supplied. Also note that the order the pairs were
1609 % supplied was arbitrary.
1610 
1611 npv = length(pv_pairs);
1612 n = npv/2;
1613 
1614 if n~=floor(n)
1615   error 'Property/value pairs must come in PAIRS.'
1616 end
1617 if n<=0
1618   % just return the defaults
1619   return
1620 end
1621 
1622 if ~isstruct(params)
1623   error 'No structure for defaults was supplied'
1624 end
1625 
1626 % there was at least one pv pair. process any supplied
1627 propnames = fieldnames(params);
1628 lpropnames = lower(propnames);
1629 for i=1:n
1630   p_i = lower(pv_pairs{2*i-1});
1631   v_i = pv_pairs{2*i};
1632 
1633   ind = strmatch(p_i,lpropnames,'exact');
1634   if isempty(ind)
1635     ind = find(strncmp(p_i,lpropnames,length(p_i)));
1636     if isempty(ind)
1637       error(['No matching property found for: ',pv_pairs{2*i-1}])
1638     elseif length(ind)>1
1639       error(['Ambiguous property name: ',pv_pairs{2*i-1}])
1640     end
1641   end
1642   p_i = propnames{ind};
1643 
1644   % override the corresponding default in params.
1645   % Use setfield for comptability issues with older releases.
1646   params = setfield(params,p_i,v_i); %#ok
1647 
1648 end
1649 
1650 
1651 
1652
distance2curve

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SUBFUNCTIONS

SOURCE CODE