elife00638.xml
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.1d1 20130915//EN" "JATS-archivearticle1.dtd"><article article-type="discussion" dtd-version="1.1d1" xmlns:xlink="http://www.w3.org/1999/xlink"><front><journal-meta><journal-id journal-id-type="nlm-ta">eLife</journal-id><journal-id journal-id-type="hwp">elife</journal-id><journal-id journal-id-type="publisher-id">eLife</journal-id><journal-title-group><journal-title>eLife</journal-title></journal-title-group><issn publication-format="electronic">2050-084X</issn><publisher><publisher-name>eLife Sciences Publications, Ltd</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">00638</article-id><article-id pub-id-type="doi">10.7554/eLife.00638</article-id><article-categories><subj-group subj-group-type="display-channel"><subject>Feature article</subject></subj-group><subj-group subj-group-type="sub-display-channel"><subject>Tutorial</subject></subj-group></article-categories><title-group><article-title>How to draw the line in biomedical research</article-title></title-group><contrib-group><contrib contrib-type="author" id="author-4290"><name><surname>Huang</surname><given-names>Lisa</given-names></name><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="conf1"/><x> is in the </x></contrib><contrib contrib-type="author" id="author-4291"><name><surname>Rattner</surname><given-names>Amir</given-names></name><xref ref-type="aff" rid="aff2"/><xref ref-type="fn" rid="conf1"/><x> is in the </x></contrib><contrib contrib-type="author" id="author-4292"><name><surname>Liu</surname><given-names>Han</given-names></name><xref ref-type="aff" rid="aff3"/><xref ref-type="fn" rid="conf1"/><x> is in the </x></contrib><contrib contrib-type="author" corresp="yes" id="author-1147"><name><surname>Nathans</surname><given-names>Jeremy</given-names></name><xref ref-type="aff" rid="aff4"/><xref ref-type="fn" rid="conf1"/><x> is a </x><role>Reviewing Editor</role><x>, and is in the </x></contrib><aff id="aff1"><institution content-type="dept">Department of Molecular Biology and Genetics</institution>, <institution>Johns Hopkins University School of Medicine</institution>, <addr-line><named-content content-type="city">Baltimore</named-content></addr-line>, <country>United States</country> and the <institution content-type="dept">Department of Biostatistics</institution>, <institution>Johns Hopkins Bloomberg School of Public Health</institution>, <addr-line><named-content content-type="city">Baltimore</named-content></addr-line>, <country>United States</country> <email>chuang36@jhmi.edu</email></aff><aff id="aff2"><institution content-type="dept">Department of Molecular Biology and Genetics</institution>, <institution>Johns Hopkins University School of Medicine</institution>, <addr-line><named-content content-type="city">Baltimore</named-content></addr-line>, <country>United States</country> <email>arattner@jhmi.edu</email></aff><aff id="aff3"><institution content-type="dept">Department of Biostatistics</institution>, <institution>Johns Hopkins Bloomberg School of Public Health</institution>, <addr-line><named-content content-type="city">Baltimore</named-content></addr-line>, <country>United States</country> and the <institution content-type="dept">Department of Computer Science</institution>, <institution>Johns Hopkins University</institution>, <addr-line><named-content content-type="city">Baltimore</named-content></addr-line>, <country>United 
States</country>; Current address: Department of Operations Research, Princeton University, Princeton, United States <email>hanliu@princeton.edu</email></aff><aff id="aff4"><institution content-type="dept">Department of Molecular Biology and Genetics, the Department of Neuroscience, the Department of Ophthalmology</institution> and the <institution>Howard Hughes Medical Institute, Johns Hopkins University School of Medicine</institution>, <addr-line><named-content content-type="city">Baltimore</named-content></addr-line>, <country>United States</country> <email>jnathans@jhmi.edu</email></aff></contrib-group><pub-date date-type="pub" publication-format="electronic"><day>19</day><month>03</month><year>2013</year></pub-date><pub-date pub-type="collection"><year>2013</year></pub-date><volume>2</volume><elocation-id>e00638</elocation-id><permissions><copyright-statement>© 2013, Huang et al</copyright-statement><copyright-year>2013</copyright-year><copyright-holder>Huang et al</copyright-holder><license xlink:href="http://creativecommons.org/licenses/by/3.0/"><license-p>This article is distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/3.0/">Creative Commons Attribution License</ext-link>, which permits unrestricted use and redistribution provided that the original author and source are credited.</license-p></license></permissions><self-uri content-type="pdf" xlink:href="elife00638.pdf"/><abstract><p>The use of the least squares method to calculate the best-fitting line through a two-dimensional scatter plot typically requires the user to assume that one of the variables depends on the other. However, in many cases the relationship between the two variables is more complex, and it is not valid to say that one variable is independent and the other is dependent. When analysing such data researchers should consider plotting the three regression lines that can be calculated for any two-dimensional scatter plot.</p></abstract><kwd-group kwd-group-type="author-keywords"><title>Author keywords</title><kwd>Tutorial</kwd><kwd>statistics</kwd><kwd>publishing</kwd></kwd-group><custom-meta-group><custom-meta><meta-name>elife-xml-version</meta-name><meta-value>2</meta-value></custom-meta></custom-meta-group></article-meta></front><body><p>Biomedical research relies on statistical analyses of data sets comprised of multiple variables and, in particular, on analyses of the relationships between pairs of variables within those data sets. In a typical analysis, data representing two variables are displayed in a two-dimensional scatter plot and the method of ordinary least squares is used to fit a regression line to the data. Here we examine an under-appreciated aspect of this approach: the slope of the regression line depends on which of the two variables we select as the independent variable. This means that the method of ordinary least squares can be used to calculate two different regression lines for the same scatter plot. While this issue has long been appreciated in the statistics community, it is not as widely known among biomedical researchers (<xref ref-type="bibr" rid="bib2">Cornbleet and Gochman, 1979</xref>). The ubiquity of scatter plots and regression lines in biomedical research suggests that a brief discussion of this issue would be useful.</p><p>Consider a data set composed of pairs of variables, with individual data points represented by (x<sub>1</sub>, y<sub>1</sub>), (x<sub>2</sub>, y<sub>2</sub>), and so on. 
In some studies, there may be a symmetric relationship between x<sub>i</sub> and y<sub>i</sub>: for example, they might represent blood pressure measurements from pairs of siblings in a cohort study. Alternatively, there may be an asymmetry in the relationship between the variables: for example, x<sub>i</sub> might represent the dose of an antihypertensive drug, and y<sub>i</sub> might represent the change in blood pressure in a group of subjects treated with various doses of the drug. In this example, drug dose is the independent variable and change in blood pressure is the dependent variable.</p><p>Typically, one is interested in determining the most likely value of the dependent variable given the value of the independent one. Thus, in the example described above, one might be interested in predicting the change in blood pressure in response to different doses of the drug. However, there are many instances in which it is not clear that one variable depends on the other (independent) variable. For example, individuals with metabolic syndrome have elevated levels of both serum triglycerides and fasting glucose: therefore, in a cohort that contains both metabolic syndrome patients and control subjects, one would expect these two variables to be correlated, with a clustering of metabolic syndrome patients at the high end of both distributions (<xref ref-type="bibr" rid="bib4">Ford et al., 2002</xref>). Although it would be inappropriate to consider one of these variables independent and the other dependent in a mechanistic sense, one might still be interested in calculating the expected level of serum triglycerides given the level of fasting glucose, or the expected level of fasting glucose given the level of serum triglycerides. As described below, estimations in either direction begin with the calculation of a best-fitting regression line.</p><sec id="s1"><title>The method of least squares</title><p>The straight line that constitutes the best fit to a set of data points in the x-y plane is typically calculated by minimizing the sum of the squares of the distances from the points to the line—a method that was introduced by Legendre and Gauss more than two hundred years ago. If one variable, conventionally represented by the y-axis, is known to depend on the other variable, conventionally represented by the x-axis, then one generally calculates a best-fitting line that represents the expected value of the dependent variable (y) as a function of the independent variable (x): this is known as the regression of y on x. In this case, the distance from a data point to the regression line (also known as the residual) is taken as the <italic>vertical</italic> distance from the point to the line (<xref ref-type="fig" rid="fig1">Figure 1A</xref>; <xref ref-type="bibr" rid="bib1">Bulmer, 1965</xref>).
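To make the calculation concrete, the short sketch below fits the regression of y on x by minimizing the sum of squared vertical residuals. It is a minimal NumPy illustration on invented data points; the array names and values are ours and do not come from the article.

import numpy as np

# Invented example data: five (x, y) pairs (not taken from the article).
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([1.2, 1.9, 3.7, 3.4, 5.1])

# Regression of y on x: minimize the sum of squared *vertical* residuals.
# The closed-form least-squares solution is slope = cov(x, y) / var(x).
slope_yx = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)
intercept_yx = y.mean() - slope_yx * x.mean()

# np.polyfit(x, y, 1) returns the same slope and intercept and can serve as a check.
print(slope_yx, intercept_yx)

Swapping the roles of x and y in this calculation gives the regression of x on y discussed next.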
This approach, referred to as ordinary least squares regression, is the default mode for line fitting in several commonly used software packages; for example, it is the algorithm represented by the ‘Trendline’ and ‘LINEST’ functions in Microsoft Excel.<fig id="fig1" position="float"><label>Figure 1.</label><caption><title>Different types of best-fitting straight lines.</title><p>These graphs show the best-fitting straight lines through the same five data points as calculated by minimizing the sum of the squares of the vertical residuals, which assumes that x is the independent variable (<bold>A</bold>); horizontal residuals, which assumes that y is the independent variable (<bold>B</bold>); and perpendicular residuals, which involves no assumptions about the variables (<bold>C</bold>).</p></caption><graphic xlink:href="elife00638f001"/></fig></p><p>If, on the other hand, the y-axis represents the independent variable and the x-axis represents the dependent variable, the best-fitting line can be calculated by taking the residuals as the <italic>horizontal</italic> distances from the points to the line; this is known as the regression of x on y (<xref ref-type="fig" rid="fig1">Figure 1B</xref>). Except in the limiting case in which all of the data points lie precisely on a straight line, these two best-fitting regression lines will not coincide.</p><p>A third type of best-fitting line can be calculated by minimizing the sum of the squares of the <italic>perpendicular</italic> distances from the points to the line (<xref ref-type="fig" rid="fig1">Figure 1C</xref>). This method is referred to as an orthogonal or Deming regression. The latter name refers to the statistician W Edwards Deming, who described the method in the 1940s (<xref ref-type="bibr" rid="bib3">Deming, 1943</xref>). The Deming regression method is symmetric with respect to the two variables and therefore makes no assumptions regarding dependence and independence (<xref ref-type="bibr" rid="bib2">Cornbleet and Gochman, 1979</xref>; <xref ref-type="bibr" rid="bib6">Linnet, 1993</xref>, <xref ref-type="bibr" rid="bib7">1998</xref>; <xref ref-type="bibr" rid="bib5">Glaister, 2001</xref>).</p><p>For a scatter plot in which the data points do not fall on a straight line, the best-fitting line calculated with vertical residuals will have a relatively shallow slope (<xref ref-type="fig" rid="fig1">Figure 1A</xref>), whereas the best-fitting line calculated with horizontal residuals will have a steeper slope (<xref ref-type="fig" rid="fig1">Figure 1B</xref>). The best-fitting line calculated with perpendicular residuals will have an intermediate slope (<xref ref-type="fig" rid="fig1">Figure 1C</xref>). The data can also be described by a correlation coefficient (R) that is agnostic with respect to the dependence or independence of the variables (<xref ref-type="bibr" rid="bib1">Bulmer, 1965</xref>). It is conventional to calculate the square of the correlation coefficient (R<sup>2</sup>), which is equal to the slope of the regression line for y on x (that is, calculated using vertical residuals) divided by the slope of the regression line for x on y (horizontal residuals). Thus, if the data points fall on a straight line (R<sup>2</sup> = 1), the slopes of these two best-fitting lines will be equal. With increasing scatter in the data (R<sup>2</sup> < 1), the slopes of the two best-fitting lines will diverge.
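The relationship between the three slopes, and the link to R<sup>2</sup>, can be verified numerically. The sketch below is again a NumPy illustration on the same invented points as above; the orthogonal slope uses the standard closed-form expression for perpendicular least squares with equally weighted variables (one common formulation of Deming regression), not code taken from the article.

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([1.2, 1.9, 3.7, 3.4, 5.1])

# Centred sums of squares and cross-products.
sxx = np.sum((x - x.mean()) ** 2)
syy = np.sum((y - y.mean()) ** 2)
sxy = np.sum((x - x.mean()) * (y - y.mean()))

slope_vertical = sxy / sxx    # regression of y on x (vertical residuals)
slope_horizontal = syy / sxy  # regression of x on y, re-expressed as a slope in the x-y plane
# Orthogonal regression (perpendicular residuals), assuming equal error variances.
slope_orthogonal = (syy - sxx + np.sqrt((syy - sxx) ** 2 + 4 * sxy ** 2)) / (2 * sxy)

# R squared equals the ratio of the two ordinary least squares slopes.
r_squared = sxy ** 2 / (sxx * syy)
assert np.isclose(slope_vertical / slope_horizontal, r_squared)

print(slope_vertical, slope_orthogonal, slope_horizontal)

With these hypothetical points the orthogonal slope falls between the other two, as described in the text.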
If the data points are completely uncorrelated (R<sup>2</sup> = 0), then the best-fitting lines calculated with vertical and horizontal residuals will have slopes of 0 and infinity, respectively.</p></sec><sec id="s2"><title>Examples with real data: three lines are better than one</title><p>These considerations are illustrated by three scatter plots in a paper on post-traumatic stress disorder by Kerry Ressler and co-workers (<xref ref-type="bibr" rid="bib8">Ressler et al., 2011</xref>). The best-fitting lines for these three plots were calculated with vertical residuals (green lines in <xref ref-type="fig" rid="fig2">Figure 2</xref>). In each case, these differ substantially from the best-fitting lines calculated with horizontal residuals (blue lines), as expected from the relatively low values of R<sup>2</sup> for the three data sets; the best-fitting lines calculated with perpendicular residuals (red lines) occupy intermediate positions.<fig id="fig2" position="float"><label>Figure 2.</label><caption><title>Best-fitting straight lines for three data sets reported by Ressler and co-workers (<xref ref-type="bibr" rid="bib8">Ressler et al., 2011</xref>).</title><p>For each of these data sets, best-fitting lines have been calculated by minimizing the sum of the squares of vertical residuals (green), horizontal residuals (blue) or perpendicular residuals (red). The variables in each data set are explained in the text; the data are taken from Figures 1a (<bold>A</bold>), 4a (<bold>B</bold>) and 4c (<bold>C</bold>) in Ressler et al. The agreement between the three lines is relatively poor, as expected from the low values of R<sup>2</sup>, where R is the correlation coefficient. The orthogonal or Deming regression shown by the red lines is not available in Microsoft Excel, but it can be calculated with Excel add-in freeware provided by Jon Peltier (<ext-link ext-link-type="uri" xlink:href="http://peltiertech.com/WordPress/deming-regression-utility">peltiertech.com/WordPress/deming-regression-utility</ext-link>), with the “r” statistics package (<ext-link ext-link-type="uri" xlink:href="http://www.r-project.org">www.r-project.org</ext-link>), and with various commercial software packages including Analyse-it (<ext-link ext-link-type="uri" xlink:href="http://analyse-it.com">analyse-it.com</ext-link>) and MedCalc (<ext-link ext-link-type="uri" xlink:href="http://www.medcalc.org/">www.medcalc.org/</ext-link>).</p></caption><graphic xlink:href="elife00638f002"/></fig></p><p>As noted above, the choice of algorithm to use in calculating the best-fitting line reflects a decision regarding which variable is independent and which is dependent. In Ressler et al. the variables in the three scatter plots are: severity of post-traumatic stress disorder (PTSD) symptoms vs. serum concentration of pituitary adenylate cyclase activating polypeptide (PACAP, also known as ADCYAP1; <xref ref-type="fig" rid="fig2">Figure 2A</xref>); severity of PTSD symptoms vs. PACAP receptor gene methylation (also known as ADCYAP1R; <xref ref-type="fig" rid="fig2">Figure 2B</xref>); and abundance of PACAP receptor mRNA in the cerebral cortex vs. abundance of PACAP mRNA in the cerebral cortex (<xref ref-type="fig" rid="fig2">Figure 2C</xref>). In this example, as in many biological systems, the cause and effect relationships between the variables are likely to be complex. 
For example, while it is possible that PACAP secretion may alter the severity of PTSD symptoms, it is also possible that stress associated with PTSD may alter PACAP secretion. Furthermore, it is possible that these two variables may have no direct cause-and-effect relationship, and that changes in neural circuitry alter stress level and PACAP secretion by distinct mechanisms. A similar argument can be applied to the other pairs of variables.</p><p>Visual inspection of <xref ref-type="fig" rid="fig2">Figure 2</xref> shows that the individual regression lines for y on x (green) or x on y (blue) do not fully capture the trend in the data points within any of the scatter plots. Plotting both regression lines gives a fuller picture of the data, and comparing their slopes provides a simple graphical assessment of the correlation coefficient. Plotting the orthogonal regression line (red) provides additional information because it makes no assumptions about the dependence or independence of the variables; as such, it appears to more accurately describe the trend in the data compared to either of the ordinary least squares regression lines.</p></sec><sec id="s3" sec-type="conclusion"><title>Conclusion</title><p>The ordinary least squares method is well suited to the analysis of data sets in which one variable influences or predicts the value of a second variable. In biological systems, where causal relationships between variables are often complex, deciding that one variable depends on the other may be somewhat arbitrary. Moreover, even when a causal chain appears to be well established mechanistically, feedback regulation at the molecular, cellular or organ system level can undermine simple models of dependence and independence. Therefore, we would like to suggest that unless the data analysis calls exclusively for a regression of y on x (or x on y), scatter plots should be presented with three best-fitting lines–calculated with horizontal, vertical and perpendicular residuals–to facilitate a more balanced assessment of trends in the data. 
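For authors who wish to adopt this presentation, the three lines can be overlaid with a few lines of plotting code. The sketch below is a minimal NumPy/matplotlib illustration on the same invented data as above, using the green/blue/red colour scheme of Figure 2; it is our own example, not the code used to prepare the article's figures.

import numpy as np
import matplotlib.pyplot as plt

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([1.2, 1.9, 3.7, 3.4, 5.1])

sxx = np.sum((x - x.mean()) ** 2)
syy = np.sum((y - y.mean()) ** 2)
sxy = np.sum((x - x.mean()) * (y - y.mean()))

lines = [
    ("y on x (vertical residuals)", sxy / sxx, "green"),
    ("x on y (horizontal residuals)", syy / sxy, "blue"),
    ("orthogonal (perpendicular residuals)",
     (syy - sxx + np.sqrt((syy - sxx) ** 2 + 4 * sxy ** 2)) / (2 * sxy), "red"),
]

xs = np.linspace(x.min(), x.max(), 100)
plt.scatter(x, y, color="black")
for label, m, colour in lines:
    b = y.mean() - m * x.mean()  # all three lines pass through the centroid (x-bar, y-bar)
    plt.plot(xs, m * xs + b, color=colour, label=label)
plt.legend()
plt.show()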
Given the ubiquity of the ordinary least squares method in the analysis of two-dimensional scatter plots, this small change in the standard approach to data presentation should prove useful across the full range of biomedical sciences.</p></sec></body><back><ack id="ack"><title>Acknowledgement</title><p>We thank Kerry Ressler for generously providing us with his data sets and for giving us permission to re-analyze his data.</p></ack><fn-group content-type="competing-interest"><fn fn-type="conflict" id="conf1"><label>Competing interests:</label><p>The authors declare that no competing interests exist.</p></fn></fn-group><ref-list><title>References</title><ref id="bib1"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Bulmer</surname><given-names>MG</given-names></name></person-group><year>1965</year><source>Principles of statistics</source><publisher-loc>New York</publisher-loc><publisher-name>Dover Publications</publisher-name></element-citation></ref><ref id="bib2"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cornbleet</surname><given-names>PJ</given-names></name><name><surname>Gochman</surname><given-names>N</given-names></name></person-group><year>1979</year><article-title>Incorrect least-squares regression coefficients in method-comparison analysis</article-title><source>Clin Chem</source><volume>25</volume><fpage>432</fpage><lpage>8</lpage></element-citation></ref><ref id="bib3"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Deming</surname><given-names>WE</given-names></name></person-group><year>1943</year><source>Statistical adjustment of data</source><publisher-loc>New York</publisher-loc><publisher-name>Wiley</publisher-name><comment>(Dover Publications edition, 1985; Dover Publications, New York)</comment></element-citation></ref><ref id="bib4"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ford</surname><given-names>ES</given-names></name><name><surname>Giles</surname><given-names>WH</given-names></name><name><surname>Dietz</surname><given-names>WH</given-names></name></person-group><year>2002</year><article-title>Prevalence of metabolic syndrome among US adults: findings from the third National Health and Nutrition Examination Survey</article-title><source>JAMA</source><volume>287</volume><fpage>356</fpage><lpage>9</lpage><pub-id pub-id-type="doi">10.1001/jama.287.3.356</pub-id></element-citation></ref><ref id="bib5"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Glaister</surname><given-names>P</given-names></name></person-group><year>2001</year><article-title>Least squares revisited</article-title><source>Mathematical Gazette</source><volume>85</volume><fpage>104</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.2307/3620485</pub-id></element-citation></ref><ref id="bib6"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Linnet</surname><given-names>K</given-names></name></person-group><year>1993</year><article-title>Evaluation of regression procedures for methods comparison studies</article-title><source>Clin Chem</source><volume>39</volume><fpage>424</fpage><lpage>32</lpage></element-citation></ref><ref id="bib7"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Linnet</surname><given-names>K</given-names></name></person-group><year>1998</year><article-title>Performance of Deming regression analysis in case of misspecified analytical error ratio in method comparison studies</article-title><source>Clin Chem</source><volume>44</volume><fpage>1024</fpage><lpage>31</lpage></element-citation></ref><ref id="bib8"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ressler</surname><given-names>KJ</given-names></name><name><surname>Mercer</surname><given-names>KB</given-names></name><name><surname>Bradley</surname><given-names>B</given-names></name><name><surname>Jovanovic</surname><given-names>T</given-names></name><name><surname>Mahan</surname><given-names>A</given-names></name><name><surname>Kerley</surname><given-names>K</given-names></name><etal/></person-group><year>2011</year><article-title>Post-traumatic stress disorder is associated with PACAP and the PAC1 receptor</article-title><source>Nature</source><volume>470</volume><fpage>492</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.1038/nature09856</pub-id></element-citation></ref></ref-list></back></article>