@string{kdd = {ACM SIGKDD Conference on Knowledge Discovery and Data Mining}} @string{proc = {Proceedings of the}} @string{kddshort = {KDD}} @string{kdd00 = proc # { 6th } # kdd # { (} # kddshort # {'00)}} @string{scg = {Annual ACM Symposium on Computational Geometry}} @string{scgshort = {SCG}} @string{scg09 = proc # { 25th } # scg # { (} # scgshort # {'09)}} @string{soda = {Annual ACM-SIAM Symposium on Discrete Algorithms}} @string{sodashort = {SODA}} @string{soda07 = proc # { 18th } # soda # { (} # sodashort # {'07)}} @manual{postgres:9.1.3, Author = {{The PostgreSQL Global Development Group}}, Date-Added = {2012-07-26 21:00:33 +0000}, Date-Modified = {2012-07-26 21:02:06 +0000}, Title = {{PostgreSQL} 9.1.3 Documentation}, Url = {http://www.postgresql.org/docs/9.1}, Year = {2011}} @article{V84a, Abstract = {{Several new methods are presented for selecting n records at random without replacement from a file containing N records. Each algorithm selects the records for the sample in a sequential manner---in the same order the records appear in the file. The algorithms are online in that the records for the sample are selected iteratively with no preprocessing. The algorithms require a constant amount of space and are short and easy to implement. The main result of this paper is the design and analysis of Algorithm D, which does the sampling in O(n) time, on the average; roughly n uniform random variates are generated, and approximately n exponentiation operations (of the form ab, for real numbers a and b) are performed during the sampling. This solves an open problem in the literature. CPU timings on a large mainframe computer indicate that Algorithm D is significantly faster than the sampling algorithms in use today.}}, Author = {Vitter, Jeffrey Scott}, Date-Added = {2012-03-10 23:03:35 -0800}, Date-Modified = {2012-03-10 23:03:35 -0800}, Doi = {10.1145/358105.893}, Journal = {Communications of the ACM}, Keywords = {Sampling}, Number = {7}, Pages = {703--718}, Publisher = {ACM}, Title = {Faster methods for random sampling}, Volume = {27}, Year = {1984}, Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUIJidUJHRvcFgkb2JqZWN0c1gkdmVyc2lvblkkYXJjaGl2ZXLRBgdUcm9vdIABqAkKFRYXGyIjVSRudWxs0wsMDQ4RFFpOUy5vYmplY3RzV05TLmtleXNWJGNsYXNzog8QgASABqISE4ACgAOAB1lhbGlhc0RhdGFccmVsYXRpdmVQYXRo0hgNGRpXTlMuZGF0YU8RA4QAAAAAA4QAAgAACFJlc2VhcmNoAAAAAAAAAAAAAAAAAAAAAAAAAMmQMDVIKwAFAAAoxB9GYXN0ZXIgbWV0aG9kcyBmb3IgcmFuIzI4QzYucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACjGyJWU4wAAAAAAAAAA/////wAADQIAAAAAAAAAAAAAAAAAAAAGVml0dGVyABAACAAAyZCgtQAAABEACAAAyJX3UwAAAAEADAAAKMQAACfbAAAD4QACAEFSZXNlYXJjaDpTaGFyZWQ6TGl0ZXJhdHVyZTpWaXR0ZXI6RmFzdGVyIG1ldGhvZHMgZm9yIHJhbiMyOEM2LnBkZgAADgBOACYARgBhAHMAdABlAHIAIABtAGUAdABoAG8AZABzACAAZgBvAHIAIAByAGEAbgBkAG8AbQAgAHMAYQBtAHAAbABpAG4AZwAuAHAAZABmAA8AEgAIAFIAZQBzAGUAYQByAGMAaAASAEAvU2hhcmVkL0xpdGVyYXR1cmUvVml0dGVyL0Zhc3RlciBtZXRob2RzIGZvciByYW5kb20gc2FtcGxpbmcucGRmABMAIC9Vc2Vycy9zY2hvcGYvRG9jdW1lbnRzL1Jlc2VhcmNoABQBmAAAAAABmAACAAEMTWFjaW50b3NoIEhEAAAAAAAAAAAAAAAAAAAAx9Q1YUgrAAAAEPW+FVJlc2VhcmNoLnNwYXJzZWJ1bmRsZQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA7alzJkDArAAAAAAAAAAD/////AAAJIAAAAAAAAAAAAAAAAAAAAAlEb2N1bWVudHMAABAACAAAx9SX0QAAABEACAAAyZCgqwAAAAEADAAQ9b4AEPWxAACS8wACADlNYWNpbnRvc2ggSEQ6VXNlcnM6c2Nob3BmOkRvY3VtZW50czpSZXNlYXJjaC5zcGFyc2VidW5kbGUAAA4ALAAVAFIAZQBzAGUAYQByAGMAaAAuAHMAcABhAHIAcwBlAGIAdQBuAGQAbABlAA8AGgAMAE0AYQBjAGkAbgB0AG8AcwBoACAASABEABIALFVzZXJzL3NjaG9wZi9Eb2N1bWVudHMvUmVzZWFyY2guc3BhcnNlYnVuZGxlABMAAS8AABUAAgAN//8AAAAVAAIADf//AACABdIcHR4fWCRjbGFzc2VzWiRjbGFzc25hbWWjHyAhXU5TTXV0YWJsZURhdGFWTlNEYXRhWE5TT2JqZWN0XxBRLi4vLi4vLi4vUmVzZWFyY2gvU2hhcmVkL0xpdGVyYXR1cmUvVml0dGVyL0Zhc3RlciBtZXRob2RzIGZvciByYW5kb20gc2FtcGxpbmcucGRm0hwdJCWiJSFcTlNEaWN0aW9uYXJ5EgABhqBfEA9OU0tleWVkQXJjaGl2ZXIACAARABYAHwAoADIANQA6ADwARQBLAFIAXQBlAGwAbwBxAHMAdgB4AHoAfACGAJMAmACgBCgEKgQvBDgEQwRHBFUEXARlBLkEvgTBBM4E0wAAAAAAAAIBAAAAAAAAACgAAAAAAAAAAAAAAAAAAATl}, Bdsk-Url-1 = {http://dx.doi.org/10.1145/358105.893}} @article{V85a, Abstract = {{We introduce fast algorithms for selecting a random sample of n records without replacement from a pool of N records, where the value of N is unknown beforehand. The main result of the paper is the design and analysis of Algorithm Z; it does the sampling in one pass using constant space and in O(n(1 + log(N/n))) expected time, which is optimum, up to a constant factor. Several optimizations are studied that collectively improve the speed of the naive version of the algorithm by an order of magnitude. We give an efficient Pascal-like implementation that incorporates these modifications and that is suitable for general use. Theoretical and empirical results indicate that Algorithm Z outperforms current methods by a significant margin.}}, Author = {Vitter, Jeffrey S.}, Date-Added = {2012-03-09 17:05:52 -0800}, Date-Modified = {2012-03-09 17:05:52 -0800}, Doi = {10.1145/3147.3165}, Journal = {ACM Transactions on Mathematical Software}, Keywords = {Sampling}, Number = {1}, Pages = {37--57}, Publisher = {ACM}, Title = {Random sampling with a reservoir}, Volume = {11}, Year = {1985}, Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUIJidUJHRvcFgkb2JqZWN0c1gkdmVyc2lvblkkYXJjaGl2ZXLRBgdUcm9vdIABqAkKFRYXGyIjVSRudWxs0wsMDQ4RFFpOUy5vYmplY3RzV05TLmtleXNWJGNsYXNzog8QgASABqISE4ACgAOAB1lhbGlhc0RhdGFccmVsYXRpdmVQYXRo0hgNGRpXTlMuZGF0YU8RA34AAAAAA34AAgAACFJlc2VhcmNoAAAAAAAAAAAAAAAAAAAAAAAAAMmQMDVIKwAFAAAoxB9SYW5kb20gc2FtcGxpbmcgd2l0aCBhIzI4QzcucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACjHyJbZ/QAAAAAAAAAA/////wAADQIAAAAAAAAAAAAAAAAAAAAGVml0dGVyABAACAAAyZCgtQAAABEACAAAyJc8bQAAAAEADAAAKMQAACfbAAAD4QACAEFSZXNlYXJjaDpTaGFyZWQ6TGl0ZXJhdHVyZTpWaXR0ZXI6UmFuZG9tIHNhbXBsaW5nIHdpdGggYSMyOEM3LnBkZgAADgBKACQAUgBhAG4AZABvAG0AIABzAGEAbQBwAGwAaQBuAGcAIAB3AGkAdABoACAAYQAgAHIAZQBzAGUAcgB2AG8AaQByAC4AcABkAGYADwASAAgAUgBlAHMAZQBhAHIAYwBoABIAPi9TaGFyZWQvTGl0ZXJhdHVyZS9WaXR0ZXIvUmFuZG9tIHNhbXBsaW5nIHdpdGggYSByZXNlcnZvaXIucGRmABMAIC9Vc2Vycy9zY2hvcGYvRG9jdW1lbnRzL1Jlc2VhcmNoABQBmAAAAAABmAACAAEMTWFjaW50b3NoIEhEAAAAAAAAAAAAAAAAAAAAx9Q1YUgrAAAAEPW+FVJlc2VhcmNoLnNwYXJzZWJ1bmRsZQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA7alzJkDArAAAAAAAAAAD/////AAAJIAAAAAAAAAAAAAAAAAAAAAlEb2N1bWVudHMAABAACAAAx9SX0QAAABEACAAAyZCgqwAAAAEADAAQ9b4AEPWxAACS8wACADlNYWNpbnRvc2ggSEQ6VXNlcnM6c2Nob3BmOkRvY3VtZW50czpSZXNlYXJjaC5zcGFyc2VidW5kbGUAAA4ALAAVAFIAZQBzAGUAYQByAGMAaAAuAHMAcABhAHIAcwBlAGIAdQBuAGQAbABlAA8AGgAMAE0AYQBjAGkAbgB0AG8AcwBoACAASABEABIALFVzZXJzL3NjaG9wZi9Eb2N1bWVudHMvUmVzZWFyY2guc3BhcnNlYnVuZGxlABMAAS8AABUAAgAN//8AAAAVAAIADf//AACABdIcHR4fWCRjbGFzc2VzWiRjbGFzc25hbWWjHyAhXU5TTXV0YWJsZURhdGFWTlNEYXRhWE5TT2JqZWN0XxBPLi4vLi4vLi4vUmVzZWFyY2gvU2hhcmVkL0xpdGVyYXR1cmUvVml0dGVyL1JhbmRvbSBzYW1wbGluZyB3aXRoIGEgcmVzZXJ2b2lyLnBkZtIcHSQloiUhXE5TRGljdGlvbmFyeRIAAYagXxAPTlNLZXllZEFyY2hpdmVyAAgAEQAWAB8AKAAyADUAOgA8AEUASwBSAF0AZQBsAG8AcQBzAHYAeAB6AHwAhgCTAJgAoAQiBCQEKQQyBD0EQQRPBFYEXwSxBLYEuQTGBMsAAAAAAAACAQAAAAAAAAAoAAAAAAAAAAAAAAAAAAAE3Q==}, Bdsk-Url-1 = {http://dx.doi.org/10.1145/3147.3165}} @article{ORO05a, Author = {Ogita, Takeshi and Rump, Siegfried M. and Oishi, Shin'ichi}, Date-Added = {2012-03-09 11:02:03 -0800}, Date-Modified = {2012-03-09 17:06:04 -0800}, Doi = {10.1137/030601818}, Journal = {SIAM Journal on Scientific Computing}, Keywords = {Numerical Analysis}, Month = jun, Number = {6}, Pages = {1955--1988}, Publisher = {Society for Industrial and Applied Mathematics}, Title = {Accurate Sum and Dot Product}, Volume = {26}, Year = {2005}} @article{MB83a, Abstract = {{A convenient one-pass algorithm for drawing a simple random sample without replace- ment of size n from a population of N members, when N is initially unknown, is presented. Moreover, even when N is known, this algorithm appears to be more efficient than previously suggested algorithms when the entire population is stored in the fast memory of the computer. Applications to sampling from a computer file and to linear programming are briefly indicated.}}, Author = {McLeod, A. I. and Bellhouse, D. R.}, Date-Added = {2012-03-08 17:17:03 -0800}, Date-Modified = {2012-03-10 23:34:30 -0800}, Eprint = {2347297}, Eprinttype = {jstor}, Journal = {Journal of the Royal Statistical Society. Series C (Applied Statistics)}, Keywords = {Sampling}, Number = {2}, Pages = {182--184}, Publisher = {Blackwell Publishing for the Royal Statistical Society}, Title = {A Convenient Algorithm for Drawing a Simple Random Sample}, Volume = {32}, Year = {1983}, Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUIJidUJHRvcFgkb2JqZWN0c1gkdmVyc2lvblkkYXJjaGl2ZXLRBgdUcm9vdIABqAkKFRYXGyIjVSRudWxs0wsMDQ4RFFpOUy5vYmplY3RzV05TLmtleXNWJGNsYXNzog8QgASABqISE4ACgAOAB1lhbGlhc0RhdGFccmVsYXRpdmVQYXRo0hgNGRpXTlMuZGF0YU8RA+oAAAAAA+oAAgAACFJlc2VhcmNoAAAAAAAAAAAAAAAAAAAAAAAAAMmQMDVIKwAFAAAocx9BIENvbnZlbmllbnQgQWxnb3JpdGhtIzI4NzQucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACh0yJbgtwAAAAAAAAAA/////wAADQIAAAAAAAAAAAAAAAAAAAARTWNMZW9kLCBCZWxsaG91c2UAABAACAAAyZCgtQAAABEACAAAyJdDJwAAAAEADAAAKHMAACfbAAAD4QACAExSZXNlYXJjaDpTaGFyZWQ6TGl0ZXJhdHVyZTpNY0xlb2QsIEJlbGxob3VzZTpBIENvbnZlbmllbnQgQWxnb3JpdGhtIzI4NzQucGRmAA4AfAA9AEEAIABDAG8AbgB2AGUAbgBpAGUAbgB0ACAAQQBsAGcAbwByAGkAdABoAG0AIABmAG8AcgAgAEQAcgBhAHcAaQBuAGcAIABhACAAUwBpAG0AcABsAGUAIABSAGEAbgBkAG8AbQAgAFMAYQBtAHAAbABlAC4AcABkAGYADwASAAgAUgBlAHMAZQBhAHIAYwBoABIAYi9TaGFyZWQvTGl0ZXJhdHVyZS9NY0xlb2QsIEJlbGxob3VzZS9BIENvbnZlbmllbnQgQWxnb3JpdGhtIGZvciBEcmF3aW5nIGEgU2ltcGxlIFJhbmRvbSBTYW1wbGUucGRmABMAIC9Vc2Vycy9zY2hvcGYvRG9jdW1lbnRzL1Jlc2VhcmNoABQBmAAAAAABmAACAAEMTWFjaW50b3NoIEhEAAAAAAAAAAAAAAAAAAAAx9Q1YUgrAAAAEPW+FVJlc2VhcmNoLnNwYXJzZWJ1bmRsZQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA7alzJkDArAAAAAAAAAAD/////AAAJIAAAAAAAAAAAAAAAAAAAAAlEb2N1bWVudHMAABAACAAAx9SX0QAAABEACAAAyZCgqwAAAAEADAAQ9b4AEPWxAACS8wACADlNYWNpbnRvc2ggSEQ6VXNlcnM6c2Nob3BmOkRvY3VtZW50czpSZXNlYXJjaC5zcGFyc2VidW5kbGUAAA4ALAAVAFIAZQBzAGUAYQByAGMAaAAuAHMAcABhAHIAcwBlAGIAdQBuAGQAbABlAA8AGgAMAE0AYQBjAGkAbgB0AG8AcwBoACAASABEABIALFVzZXJzL3NjaG9wZi9Eb2N1bWVudHMvUmVzZWFyY2guc3BhcnNlYnVuZGxlABMAAS8AABUAAgAN//8AAAAVAAIADf//AACABdIcHR4fWCRjbGFzc2VzWiRjbGFzc25hbWWjHyAhXU5TTXV0YWJsZURhdGFWTlNEYXRhWE5TT2JqZWN0XxBzLi4vLi4vLi4vUmVzZWFyY2gvU2hhcmVkL0xpdGVyYXR1cmUvTWNMZW9kLCBCZWxsaG91c2UvQSBDb252ZW5pZW50IEFsZ29yaXRobSBmb3IgRHJhd2luZyBhIFNpbXBsZSBSYW5kb20gU2FtcGxlLnBkZtIcHSQloiUhXE5TRGljdGlvbmFyeRIAAYagXxAPTlNLZXllZEFyY2hpdmVyAAgAEQAWAB8AKAAyADUAOgA8AEUASwBSAF0AZQBsAG8AcQBzAHYAeAB6AHwAhgCTAJgAoASOBJAElQSeBKkErQS7BMIEywVBBUYFSQVWBVsAAAAAAAACAQAAAAAAAAAoAAAAAAAAAAAAAAAAAAAFbQ==}, Bdsk-Url-1 = {http://www.jstor.org/stable/2347297}} @article{C82a, Abstract = {{We present a general purpose unequal probability without replacement sampling plan with fixed sample size. In contrast to existing such plans, our scheme keeps the sample size fixed and lets the population units enter the sample one at a time through a carefully designed random mechanism. Consequently, all high-order inclusion probabilities can be easily computed.}}, Author = {Chao, Min-Te}, Date-Added = {2012-03-08 17:13:26 -0800}, Date-Modified = {2012-03-08 17:17:59 -0800}, Doi = {10.1093/biomet/69.3.653}, Journal = {Biometrika}, Keywords = {Sampling}, Number = {3}, Pages = {653--656}, Title = {A general purpose unequal probability sampling plan}, Volume = {69}, Year = {1982}} @book{CS08a, Author = {Christopher D. Manning and Prabhakar Raghavan and Hinrich Sch{\"u}tze}, Date-Added = {2012-03-07 10:47:43 -0800}, Date-Modified = {2012-03-07 10:48:21 -0800}, Publisher = {Cambridge University Press}, Title = {Introduction to Information Retrieval}, Url = {http://nlp.stanford.edu/IR-book/}, Year = {2008}} @inproceedings{V09a, Abstract = {{The k-means algorithm is a well-known method for partitioning n points that lie in the d-dimensional space into k clusters. Its main features are simplicity and speed in practice. Theoretically, however, the best known upper bound on its running time (i.e. O(nkd)) is, in general, exponential in the number of points (when kd=\Omega(n log n)). Recently, Arthur and Vassilvitskii [2] showed a super-polynomial worst-case analysis, improving the best known lower bound from \Omega(n) to 2\Omega(\sqrt{n}) with a construction in d=\Omega(\sqrt{n}) dimensions. In [2] they also conjectured the existence of super-polynomial lower bounds for any d>=2. Our contribution is twofold: we prove this conjecture and we improve the lower bound, by presenting a simple construction in the plane that leads to the exponential lower bound 2^\Omega(n).}}, Author = {Vattani, Andrea}, Crossref = {:SCG09}, Date-Added = {2012-03-06 15:54:24 -0800}, Date-Modified = {2012-03-08 17:17:37 -0800}, Doi = {10.1145/1542362.1542419}, Keywords = {Clustering}, Pages = {324--332}, Title = {$k$-means requires exponentially many iterations even in the plane}} @online{AMR09a, Abstract = {{The k-means method is one of the most widely used clustering algorithms, drawing its popularity from its speed in practice. Recently, however, it was shown to have exponential worst-case running time. In order to close the gap between practical performance and theoretical analysis, the k-means method has been studied in the model of smoothed analysis. But even the smoothed analyses so far are unsatisfactory as the bounds are still super-polynomial in the number n of data points. In this paper, we settle the smoothed running time of the k-means method. We show that the smoothed number of iterations is bounded by a polynomial in n and 1/\sigma, where \sigma is the standard deviation of the Gaussian perturbations. This means that if an arbitrary input data set is randomly perturbed, then the k-means method will run in expected polynomial time on that input set.}}, Author = {David Arthur and Bodo Manthey and Heiko R{\"o}glin}, Date-Added = {2012-03-06 15:34:08 -0800}, Date-Modified = {2012-03-10 23:48:17 -0800}, Eprint = {0904.1113}, Eprintclass = {cs.DS}, Eprinttype = {arxiv}, Title = {k-Means has Polynomial Smoothed Complexity}, Year = {2009}} @article{L82a, Abstract = {{It has long been realized that in pulse-code modulation (PCM), with a given ensemble of signals to handle, the quantum values should be spaced more closely in the voltage regions where the signal amplitude is more likely to fall. It has been shown by Panter and Dite that, in the limit as the number of quanta becomes infinite, the asymptotic fractional density of quanta per unit voltage should vary as the one-third power of the probability density per unit voltage of signal amplitudes. In this paper the corresponding result for any finite number of quanta is derived; that is, necessary conditions are found that the quanta and associated quantization intervals of an optimum finite quantization scheme must satisfy. The optimization criterion used is that the average quantization noise power be a minimum. It is shown that the result obtained here goes over into the Panter and Dite result as the number of quanta become large. The optimum quautization schemes for2^{b}quanta,b=1,2, cdots, 7, are given numerically for Gaussian and for Laplacian distribution of signal amplitudes.}}, Author = {Stuart Lloyd}, Date-Added = {2012-03-06 15:00:11 -0800}, Date-Modified = {2012-03-06 15:04:10 -0800}, Doi = {10.1109/TIT.1982.1056489}, Journal = {IEEE Transactions on Information Theory}, Month = mar, Note = {Technical Report appeared much earlier in: \emph{Bell Telephone Laboratories Paper} (1957)}, Number = {2}, Pages = {129--137}, Title = {Least squares quantization in PCM}, Volume = {28}, Year = {1982}} @article{MNV10a, Abstract = {{In the k -means problem, we are given a finite set S of points in R^m , and integer k >= 1, and we want to find k points (centers) so as to minimize the sum of the square of the Euclidean distance of each point in S to its nearest center. We show that this well-known problem is NP-hard even for instances in the plane, answering an open question posed by Dasgupta (2007) [7].}}, Author = {Meena Mahajan and Prajakta Nimbhorkar and Kasturi Varadarajan}, Date-Added = {2012-03-06 14:52:27 -0800}, Date-Modified = {2012-03-08 17:17:44 -0800}, Doi = {10.1016/j.tcs.2010.05.034}, Journal = {Theoretical Computer Science}, Keywords = {Clustering}, Month = jun, Note = {In Press.}, Title = {The planar $k$-means problem is {NP}-hard}, Year = {2010}} @article{ADH09a, Abstract = {{A recent proof of NP-hardness of Euclidean sum-of-squares clustering, due to Drineas et al. (Mach. Learn. 56:9--33, 2004), is not valid. An alternate short proof is provided.}}, Author = {Aloise, Daniel and Deshpande, Amit and Hansen, Pierre and Popat, Preyas}, Date-Added = {2012-03-06 14:43:42 -0800}, Date-Modified = {2012-03-06 14:45:02 -0800}, Doi = {10.1007/s10994-009-5103-0}, Journal = {Machine Learning}, Pages = {245-248}, Title = {{NP}-hardness of {Euclidean} sum-of-squares clustering}, Volume = {75}, Year = {2009}} @book{F68a, Author = {William Feller}, Date-Added = {2012-01-31 11:53:21 -0800}, Date-Modified = {2012-01-31 11:53:42 -0800}, Edition = {3rd}, Publisher = {Wiley}, Title = {An Introduction to Probability Theory and Its Applications}, Year = {1968}} @inproceedings{MNU00a, Author = {Andrew McCallum and Kamal Nigam and Lyle H. Ungar}, Crossref = {:KDD00}, Date-Added = {2012-01-30 22:34:21 -0800}, Date-Modified = {2012-03-10 23:31:44 -0800}, Doi = {10.1145/347090.347123}, Keywords = {Clustering}, Pages = {169--178}, Title = {Efficient clustering of high-dimensional data sets with application to reference matching}, Year = {2000}, Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUIJidUJHRvcFgkb2JqZWN0c1gkdmVyc2lvblkkYXJjaGl2ZXLRBgdUcm9vdIABqAkKFRYXGyIjVSRudWxs0wsMDQ4RFFpOUy5vYmplY3RzV05TLmtleXNWJGNsYXNzog8QgASABqISE4ACgAOAB1lhbGlhc0RhdGFccmVsYXRpdmVQYXRo0hgNGRpXTlMuZGF0YU8RAuYAAAAAAuYAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAMfUNWFIKwAAAPVN/R9FZmZpY2llbnQgY2x1c3RlcmluZyNGNTRERjAucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA9U3wy0zLzAAAAAAAAAAAAAMABAAACSAAAAAAAAAAAAAAAAAAAAAWTWNDYWxsdW0sIE5pZ2FtLCBVbmdhcgAQAAgAAMfUl9EAAAARAAgAAMtNPEwAAAABABgA9U39APVNSwDyIY0AEPW+ABD1sQAAkvMAAgBtTWFjaW50b3NoIEhEOlVzZXJzOnNjaG9wZjpEb2N1bWVudHM6ay1tZWFuczpMaXRlcmF0dXJlOk1jQ2FsbHVtLCBOaWdhbSwgVW5nYXI6RWZmaWNpZW50IGNsdXN0ZXJpbmcjRjU0REYwLnBkZgAADgC8AF0ARQBmAGYAaQBjAGkAZQBuAHQAIABjAGwAdQBzAHQAZQByAGkAbgBnACAAbwBmACAAaABpAGcAaAAtAGQAaQBtAGUAbgBzAGkAbwBuAGEAbAAgAGQAYQB0AGEAIABzAGUAdABzACAAdwBpAHQAaAAgAGEAcABwAGwAaQBjAGEAdABpAG8AbgAgAHQAbwAgAHIAZQBmAGUAcgBlAG4AYwBlACAAbQBhAHQAYwBoAGkAbgBnAC4AcABkAGYADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgCeVXNlcnMvc2Nob3BmL0RvY3VtZW50cy9rLW1lYW5zL0xpdGVyYXR1cmUvTWNDYWxsdW0sIE5pZ2FtLCBVbmdhci9FZmZpY2llbnQgY2x1c3RlcmluZyBvZiBoaWdoLWRpbWVuc2lvbmFsIGRhdGEgc2V0cyB3aXRoIGFwcGxpY2F0aW9uIHRvIHJlZmVyZW5jZSBtYXRjaGluZy5wZGYAEwABLwAAFQACAA3//wAAgAXSHB0eH1gkY2xhc3Nlc1okY2xhc3NuYW1lox8gIV1OU011dGFibGVEYXRhVk5TRGF0YVhOU09iamVjdF8QkC4uLy4uLy4uL2stbWVhbnMvTGl0ZXJhdHVyZS9NY0NhbGx1bSwgTmlnYW0sIFVuZ2FyL0VmZmljaWVudCBjbHVzdGVyaW5nIG9mIGhpZ2gtZGltZW5zaW9uYWwgZGF0YSBzZXRzIHdpdGggYXBwbGljYXRpb24gdG8gcmVmZXJlbmNlIG1hdGNoaW5nLnBkZtIcHSQloiUhXE5TRGljdGlvbmFyeRIAAYagXxAPTlNLZXllZEFyY2hpdmVyAAgAEQAWAB8AKAAyADUAOgA8AEUASwBSAF0AZQBsAG8AcQBzAHYAeAB6AHwAhgCTAJgAoAOKA4wDkQOaA6UDqQO3A74DxwRaBF8EYgRvBHQAAAAAAAACAQAAAAAAAAAoAAAAAAAAAAAAAAAAAAAEhg==}} @proceedings{:KDD00, Booktitle = kdd00, Date-Added = {2012-01-30 22:33:08 -0800}, Date-Modified = {2012-01-30 22:33:34 -0800}, Title = kdd00, Year = {2000}} @inproceedings{AV07a, Author = {David Arthur and Sergei Vassilvitskii}, Crossref = {:SODA07}, Date-Added = {2012-01-30 22:05:40 -0800}, Date-Modified = {2012-03-11 03:00:57 -0700}, Eprint = {1283494}, Eprinttype = {acm}, Pages = {1027--1035}, Title = {k-means++: the advantages of careful seeding}, Year = {2007}, Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUIJidUJHRvcFgkb2JqZWN0c1gkdmVyc2lvblkkYXJjaGl2ZXLRBgdUcm9vdIABqAkKFRYXGyIjVSRudWxs0wsMDQ4RFFpOUy5vYmplY3RzV05TLmtleXNWJGNsYXNzog8QgASABqISE4ACgAOAB1lhbGlhc0RhdGFccmVsYXRpdmVQYXRo0hgNGRpXTlMuZGF0YU8RAloAAAAAAloAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAMfUNWFIKwAAAPVNTB9rLW1lYW5zKysgdGhlIGFkdmFudCNGNTREMTIucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA9U0Sy0zJnQAAAAAAAAAAAAMABAAACSAAAAAAAAAAAAAAAAAAAAAVQXJ0aHVyLCBWYXNzaWx2aXRza2lpAAAQAAgAAMfUl9EAAAARAAgAAMtNOh0AAAABABgA9U1MAPVNSwDyIY0AEPW+ABD1sQAAkvMAAgBsTWFjaW50b3NoIEhEOlVzZXJzOnNjaG9wZjpEb2N1bWVudHM6ay1tZWFuczpMaXRlcmF0dXJlOkFydGh1ciwgVmFzc2lsdml0c2tpaTprLW1lYW5zKysgdGhlIGFkdmFudCNGNTREMTIucGRmAA4AYAAvAGsALQBtAGUAYQBuAHMAKwArACAAdABoAGUAIABhAGQAdgBhAG4AdABhAGcAZQBzACAAbwBmACAAYwBhAHIAZQBmAHUAbAAgAHMAZQBlAGQAaQBuAGcALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAG9Vc2Vycy9zY2hvcGYvRG9jdW1lbnRzL2stbWVhbnMvTGl0ZXJhdHVyZS9BcnRodXIsIFZhc3NpbHZpdHNraWkvay1tZWFucysrIHRoZSBhZHZhbnRhZ2VzIG9mIGNhcmVmdWwgc2VlZGluZy5wZGYAABMAAS8AABUAAgAN//8AAIAF0hwdHh9YJGNsYXNzZXNaJGNsYXNzbmFtZaMfICFdTlNNdXRhYmxlRGF0YVZOU0RhdGFYTlNPYmplY3RfEGEuLi8uLi8uLi9rLW1lYW5zL0xpdGVyYXR1cmUvQXJ0aHVyLCBWYXNzaWx2aXRza2lpL2stbWVhbnMrKyB0aGUgYWR2YW50YWdlcyBvZiBjYXJlZnVsIHNlZWRpbmcucGRm0hwdJCWiJSFcTlNEaWN0aW9uYXJ5EgABhqBfEA9OU0tleWVkQXJjaGl2ZXIACAARABYAHwAoADIANQA6ADwARQBLAFIAXQBlAGwAbwBxAHMAdgB4AHoAfACGAJMAmACgAv4DAAMFAw4DGQMdAysDMgM7A58DpAOnA7QDuQAAAAAAAAIBAAAAAAAAACgAAAAAAAAAAAAAAAAAAAPL}} @proceedings{:SODA07, Booktitle = soda07, Date-Added = {2012-01-30 22:11:03 -0800}, Date-Modified = {2012-01-30 22:11:27 -0800}, Title = soda07, Year = {2007}} @proceedings{:SCG09, Booktitle = scg09, Date-Added = {2012-03-06 15:59:03 -0800}, Date-Modified = {2012-03-06 15:59:48 -0800}, Title = scg09, Year = {2009}} @inproceedings{DBLP:conf/icml/SrebroJ03, Author = {Nathan Srebro and Tommi Jaakkola}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {ICML}, Crossref = {DBLP:conf/icml/2003}, Pages = {720-727}, Title = {Weighted Low-Rank Approximations}, Year = {2003}} @proceedings{DBLP:conf/icml/2003, Bibsource = {DBLP, http://dblp.uni-trier.de}, Editor = {Tom Fawcett and Nina Mishra}, Isbn = {1-57735-189-4}, Publisher = {AAAI Press}, Title = {Machine Learning, Proceedings of the Twentieth International Conference (ICML 2003), August 21-24, 2003, Washington, DC, USA}, Year = {2003}} @inproceedings{:TheNetflixPrize07, Author = {James Bennett and Stan Lanning}, Crossref = {:KDDCup07}, Date-Added = {2012-06-12 15:27:25}, Eprinttype = {acm}, Title = {The Netflix Prize}, Year = {2007}} @proceedings{:KDDCup07, Booktitle = {KDD Cup and Workshop}, Date-Added = {2012-06-12 15:27:25}, Title = {KDD Cup and Workshop}, Year = {2007}} @inproceedings{DBLP:conf/sigmod/FengKRR12, Author = {Xixuan Feng and Arun Kumar and Benjamin Recht and Christopher R{\'e}}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {SIGMOD Conference}, Crossref = {DBLP:conf/sigmod/2012}, Ee = {http://doi.acm.org/10.1145/2213836.2213874}, Pages = {325-336}, Title = {Towards a unified architecture for in-RDBMS analytics}, Year = {2012}} @proceedings{DBLP:conf/sigmod/2012, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {SIGMOD Conference}, Editor = {K. Sel\c{c}uk Candan and Yi Chen and Richard T. Snodgrass and Luis Gravano and Ariel Fuxman}, Ee = {http://dl.acm.org/citation.cfm?id=2213836}, Isbn = {978-1-4503-1247-9}, Publisher = {ACM}, Title = {Proceedings of the ACM SIGMOD International Conference on Management of Data, SIGMOD 2012, Scottsdale, AZ, USA, May 20-24, 2012}, Year = {2012}} @article{DBLP:journals/siamrev/RechtFP10, Author = {Benjamin Recht and Maryam Fazel and Pablo A. Parrilo}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Ee = {http://dx.doi.org/10.1137/070697835}, Journal = {SIAM Review}, Number = {3}, Pages = {471-501}, Title = {Guaranteed Minimum-Rank Solutions of Linear Matrix Equations via Nuclear Norm Minimization}, Volume = {52}, Year = {2010}} @article{DBLP:journals/cacm/CandesR12, Author = {Emmanuel J. Cand{\`e}s and Benjamin Recht}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Ee = {http://doi.acm.org/10.1145/2184319.2184343}, Journal = {Commun. ACM}, Number = {6}, Pages = {111-119}, Title = {Exact matrix completion via convex optimization}, Volume = {55}, Year = {2012}} @inproceedings{DBLP:conf/kdd/GemullaNHS11, Author = {Rainer Gemulla and Erik Nijkamp and Peter J. Haas and Yannis Sismanis}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {KDD}, Crossref = {DBLP:conf/kdd/2011}, Ee = {http://doi.acm.org/10.1145/2020408.2020426}, Pages = {69-77}, Title = {Large-scale matrix factorization with distributed stochastic gradient descent}, Year = {2011}} @proceedings{DBLP:conf/kdd/2011, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {KDD}, Editor = {Chid Apt{\'e} and Joydeep Ghosh and Padhraic Smyth}, Isbn = {978-1-4503-0813-7}, Publisher = {ACM}, Title = {Proceedings of the 17th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, San Diego, CA, USA, August 21-24, 2011}, Year = {2011}} @inproceedings{DBLP:conf/nips/DuchiAW10, Author = {John C. Duchi and Alekh Agarwal and Martin J. Wainwright}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {NIPS}, Crossref = {DBLP:conf/nips/2010}, Ee = {http://books.nips.cc/papers/files/nips23/NIPS2010_0423.pdf}, Pages = {550-558}, Title = {Distributed Dual Averaging In Networks}, Year = {2010}} @proceedings{DBLP:conf/nips/2010, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {NIPS}, Editor = {John D. Lafferty and Christopher K. I. Williams and John Shawe-Taylor and Richard S. Zemel and Aron Culotta}, Publisher = {Curran Associates, Inc.}, Title = {Advances in Neural Information Processing Systems 23: 24th Annual Conference on Neural Information Processing Systems 2010. Proceedings of a meeting held 6-9 December 2010, Vancouver, British Columbia, Canada}, Year = {2010}} @inproceedings{DBLP:conf/nips/WrightGRPM09, Author = {John Wright and Arvind Ganesh and Shankar Rao and YiGang Peng and Yi Ma}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {NIPS}, Crossref = {DBLP:conf/nips/2009}, Ee = {http://books.nips.cc/papers/files/nips22/NIPS2009_0116.pdf}, Pages = {2080-2088}, Title = {Robust Principal Component Analysis: Exact Recovery of Corrupted Low-Rank Matrices via Convex Optimization}, Year = {2009}} @proceedings{DBLP:conf/nips/2009, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {NIPS}, Editor = {Yoshua Bengio and Dale Schuurmans and John D. Lafferty and Christopher K. I. Williams and Aron Culotta}, Isbn = {9781615679119}, Publisher = {Curran Associates, Inc.}, Title = {Advances in Neural Information Processing Systems 22: 23rd Annual Conference on Neural Information Processing Systems 2009. Proceedings of a meeting held 7-10 December 2009, Vancouver, British Columbia, Canada}, Year = {2009}} @article{springerlink:10.1007/s10107-011-0472-0, Affiliation = {Department of Electrical Engineering and Computer Science, Laboratory for Information and Decision Systems, M.I.T., Mass, Cambridge, MA 02139, USA}, Author = {Bertsekas, Dimitri}, Date-Modified = {2012-08-02 18:08:19 +0000}, Doi = {10.1007/s10107-011-0472-0}, Issn = {0025-5610}, Issue = {2}, Journal = {Mathematical Programming}, Keyword = {Mathematics and Statistics}, Pages = {163--195}, Publisher = {Springer Berlin / Heidelberg}, Title = {Incremental proximal methods for large scale convex optimization}, Volume = {129}, Year = {2011}} @book{nocedal2006numerical, Author = {Nocedal, J. and Wright, S.J.}, Isbn = {9780387303031}, Lccn = {2006923897}, Publisher = {Springer}, Series = {Springer series in operations research}, Title = {Numerical optimization}, Url = {http://books.google.com/books?id=eNlPAAAAMAAJ}, Year = {2006}} @inproceedings{DBLP:conf/nips/BottouB07, Author = {L{\'e}on Bottou and Olivier Bousquet}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {NIPS}, Crossref = {DBLP:conf/nips/2007}, Ee = {http://books.nips.cc/papers/files/nips20/NIPS2007_0726.pdf}, Title = {The Tradeoffs of Large Scale Learning}, Year = {2007}} @proceedings{DBLP:conf/nips/2007, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {NIPS}, Editor = {John C. Platt and Daphne Koller and Yoram Singer and Sam T. Roweis}, Publisher = {Curran Associates, Inc.}, Title = {Advances in Neural Information Processing Systems 20, Proceedings of the Twenty-First Annual Conference on Neural Information Processing Systems, Vancouver, British Columbia, Canada, December 3-6, 2007}, Year = {2008}} @book{bertsekas1999nonlinear, Author = {Bertsekas, D.P.}, Isbn = {9781886529007}, Lccn = {95081359}, Publisher = {Athena Scientific}, Series = {Athena scientific optimization and computation series}, Title = {Nonlinear programming}, Url = {http://books.google.com/books?id=TgMpAQAAMAAJ}, Year = {1999}} @article{hager2006survey, Author = {Hager, W. W. and Zhang, H.}, Citeulike-Article-Id = {7511569}, Journal = {Pacific journal of Optimization}, Keywords = {bibliot-11-08-07}, Number = {1}, Pages = {35--58}, Posted-At = {2010-07-19 10:35:37}, Priority = {2}, Publisher = {Citeseer}, Title = {{A survey of nonlinear conjugate gradient methods}}, Volume = {2}, Year = {2006}} @article{Cox1972, Author = {Cox, D.R.}, Title= {Regression models and life-tables}, Year = {1972}, Journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, Volume = {34}, Number = {2}, Pages = {187-220},} @proceedings{DBLP:conf/icml/2001, editor = {Carla E. Brodley and Andrea Pohoreckyj Danyluk}, title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML 2001), Williams College, Williamstown, MA, USA, June 28 - July 1, 2001}, publisher = {Morgan Kaufmann}, year = {2001}, isbn = {1-55860-778-1}, bibsource = {DBLP, http://dblp.uni-trier.de} } @inproceedings{DBLP:conf/icml/LaffertyMP01, author = {John D. Lafferty and Andrew McCallum and Fernando C. N. Pereira}, title = {Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data}, booktitle = {ICML}, year = {2001}, pages = {282-289}, bibsource = {DBLP, http://dblp.uni-trier.de} } @article{DBLP:journals/scholarpedia/Viterbi09, author = {Andrew J. Viterbi}, title = {Viterbi algorithm}, journal = {Scholarpedia}, volume = {4}, number = {1}, year = {2009}, pages = {6246}, ee = {http://www.scholarpedia.org/article/Viterbi_algorithm}, bibsource = {DBLP, http://dblp.uni-trier.de} } @article{DBLP:journals/siamjo/MoralesN00, author = {Jos{\'e} Luis Morales and Jorge Nocedal}, title = {Automatic Preconditioning by Limited Memory Quasi-Newton Updating}, journal = {SIAM Journal on Optimization}, volume = {10}, number = {4}, year = {2000}, pages = {1079-1096}, ee = {http://dx.doi.org/10.1137/S1052623497327854}, bibsource = {DBLP, http://dblp.uni-trier.de} } @article{DBLP:journals/coling/DeRose88, author = {Steven J. DeRose}, title = {Grammatical Category Disambiguation by Statistical Optimization}, journal = {Computational Linguistics}, volume = {14}, number = {1}, year = {1988}, pages = {31-39}, bibsource = {DBLP, http://dblp.uni-trier.de} } @inproceedings{DBLP:conf/naacl/ShaP03, author = {Fei Sha and Fernando C. N. Pereira}, title = {Shallow Parsing with Conditional Random Fields}, booktitle = {HLT-NAACL}, year = {2003}, ee = {http://acl.ldc.upenn.edu/N/N03/N03-1028.pdf}, bibsource = {DBLP, http://dblp.uni-trier.de} } @article{DBLP:journals/coling/MarcusSM94, author = {Mitchell P. Marcus and Beatrice Santorini and Mary Ann Marcinkiewicz}, title = {Building a Large Annotated Corpus of English: The Penn Treebank}, journal = {Computational Linguistics}, volume = {19}, number = {2}, year = {1993}, pages = {313-330}, bibsource = {DBLP, http://dblp.uni-trier.de} } @techreport{armadillo, Author = {Conrad Sanderson}, Institution = {NICTA}, Title = {Armadillo: An Open Source {C++} Linear Algebra Library for Fast Prototyping and Computationally Intensive Experiments}, Year = {2010}} @misc{eigen, Author = {Ga\"{e}l Guennebaud and Beno\^{i}t Jacob and others}, Title = {Eigen v3}, Url = {http://eigen.tuxfamily.org}, Year = {2010} }