Ability to tokenize words.

convert_tokens(x, path = FALSE, split_pdf = FALSE,
  remove_hyphen = TRUE, token_function = NULL)

Arguments

x

The text of the pdf file. This can be supplied directly as text, or as a file path, in which case the pdftools package is used to read the pdf file. To read from a file path, the path argument must be set to TRUE.

path

TRUE/FALSE indicating whether x is a path to the pdf file to be converted to text, rather than the text itself. The pdftools package is used for this conversion. Default is FALSE.

split_pdf

TRUE/FALSE indicating whether to split the pdf using white space. This would be most useful with multicolumn pdf files. The split_pdf function attempts to recreate the column layout of the text into a single column starting with the left column and proceeding to the right. Default is FALSE.

remove_hyphen

TRUE/FALSE indicating whether hyphenated words should be adjusted to combine onto a single line. Default is TRUE.

token_function

A function from the tokenizers package used to split the text into tokens. The default, NULL, uses the tokenize_words function.

Value

A list of character vectors containing the tokens. More detail can be found in the documentation of the tokenizers package.

Examples

file <- system.file('pdf', '1610.00147.pdf', package = 'pdfsearch')
convert_tokens(file, path = TRUE)
#> [[1]] #> [[1]][[1]] #> [1] "data" "fusion" "for" "correcting" #> [5] "measurement" "errors" "tracy" "schifeling" #> [9] "jerome" "p" "reiter" "maria" #> [13] "deyoreo" "arxiv" "1610.00147v1" "stat.me" #> [17] "1" "oct" "2016" "abstract" #> [21] "often" "in" "surveys" "key" #> [25] "items" "are" "subject" "to" #> [29] "measurement" "errors" "given" "just" #> [33] "the" "data" "it" "can" #> [37] "be" "difficult" "to" "determine" #> [41] "the" "distribution" "of" "this" #> [45] "error" "process" "and" "hence" #> [49] "to" "obtain" "accurate" "inferences" #> [53] "that" "involve" "the" "error" #> [57] "prone" "variables" "in" "some" #> [61] "settings" "however" "analysts" "have" #> [65] "access" "to" "a" "data" #> [69] "source" "on" "different" "in" #> [73] "dividuals" "with" "high" "quality" #> [77] "measurements" "of" "the" "error" #> [81] "prone" "survey" "items" "we" #> [85] "present" "a" "data" "fusion" #> [89] "framework" "for" "leveraging" "this" #> [93] "information" "to" "improve" "infer" #> [97] "ences" "in" "the" "error" #> [101] "prone" "survey" "the" "basic" #> [105] "idea" "is" "to" "posit" #> [109] "models" "about" "the" "rates" #> [113] "at" "which" "individuals" "make" #> [117] "errors" "coupled" "with" "models" #> [121] "for" "the" "values" "reported" #> [125] "when" "errors" "are" "made" #> [129] "this" "can" "avoid" "the" #> [133] "unrealistic" "assumption" "of" "conditional" #> [137] "independence" "typically" "used" "in" #> [141] "data" "fusion" "we" "apply" #> [145] "the" "approach" "on" "the" #> [149] "re" "ported" "values" "of" #> [153] "educational" "attainments" "in" "the" #> [157] "american" "community" "survey" "using" #> [161] "the" "national" "survey" "of" #> [165] "college" "graduates" "as" "the" #> [169] "high" "quality" "data" "source" #> [173] "in" "doing" "so" "we" #> [177] "account" "for" "the" "informative" #> [181] "sampling" "design" "used" "to" #> [185] "select" "the" "national" "survey" #> [189] "of" "college" "graduates" 
"we" #> [193] "also" "present" "a" "process" #> [197] "for" "assessing" "the" "sensitivity" #> [201] "of" "various" "analyses" "to" #> [205] "different" "choices" "for" "the" #> [209] "measurement" "error" "models" "supplemental" #> [213] "material" "is" "available" "online" #> [217] "key" "words" "fusion" "imputation" #> [221] "measurement" "error" "missing" "survey" #> [225] "this" "research" "was" "supported" #> [229] "by" "the" "national" "science" #> [233] "foundation" "under" "award" "ses" #> [237] "11" "31897" "the" "authors" #> [241] "wish" "to" "thank" "seth" #> [245] "sanders" "for" "his" "input" #> [249] "on" "informative" "prior" "specifications" #> [253] "and" "mauricio" "sadinle" "for" #> [257] "discussion" "that" "improved" "the" #> [261] "strategy" "for" "accounting" "for" #> [265] "the" "informative" "sample" "design" #> [269] "1" #> #> #> [[2]] #> [[2]][[1]] #> [1] "1" "introduction" "survey" "data" #> [5] "often" "contain" "items" "that" #> [9] "are" "subject" "to" "measurement" #> [13] "errors" "for" "example" "some" #> [17] "respondents" "might" "misunderstand" "a" #> [21] "question" "or" "accidentally" "select" #> [25] "the" "wrong" "response" "thereby" #> [29] "providing" "values" "unequal" "to" #> [33] "their" "factual" "values" "left" #> [37] "uncorrected" "these" "measurement" "errors" #> [41] "can" "result" "in" "degraded" #> [45] "inferences" "kim" "et" "al" #> [49] "2015" "unfor" "tunately" "the" #> [53] "distribution" "of" "the" "measurement" #> [57] "errors" "typically" "is" "not" #> [61] "estimable" "from" "the" "survey" #> [65] "data" "alone" "one" "either" #> [69] "needs" "to" "make" "strong" #> [73] "assumptions" "about" "the" "measure" #> [77] "ment" "error" "process" "e.g" #> [81] "as" "in" "curran" "and" #> [85] "hussong" "2009" "or" "leverage" #> [89] "information" "from" "some" "other" #> [93] "source" "of" "data" "as" #> [97] "we" "do" "here" "one" #> [101] "natural" "source" "of" "information" #> [105] "is" "a" "validation" 
"sample" #> [109] "i.e" "a" "dataset" "with" #> [113] "both" "the" "reported" "possibly" #> [117] "erroneous" "values" "and" "the" #> [121] "true" "values" "measured" "on" #> [125] "the" "same" "individuals" "these" #> [129] "individuals" "could" "be" "a" #> [133] "subset" "of" "the" "original" #> [137] "survey" "pepe" "1992" "yucel" #> [141] "and" "zaslavsky" "2005" "or" #> [145] "a" "completely" "distinct" "set" #> [149] "raghunathan" "2006" "schenker" "and" #> [153] "raghunathan" "2007" "schenker" "et" #> [157] "al" "2010" "carrig" "et" #> [161] "al" "2015" "with" "validation" #> [165] "data" "one" "can" "model" #> [169] "the" "relationship" "between" "the" #> [173] "error" "prone" "and" "true" #> [177] "values" "and" "use" "the" #> [181] "model" "to" "replace" "the" #> [185] "error" "prone" "items" "with" #> [189] "multiply" "imputed" "plausible" "true" #> [193] "values" "reiter" "2008" "siddique" #> [197] "et" "al" "2015" "in" #> [201] "many" "settings" "however" "it" #> [205] "is" "not" "possible" "to" #> [209] "obtain" "validation" "samples" "e.g" #> [213] "because" "it" "is" "too" #> [217] "expensive" "or" "because" "someone" #> [221] "other" "than" "the" "analyst" #> [225] "collected" "the" "data" "in" #> [229] "such" "cases" "another" "potential" #> [233] "source" "of" "information" "is" #> [237] "a" "separate" "gold" "stan" #> [241] "dard" "dataset" "that" "includes" #> [245] "true" "or" "at" "least" #> [249] "very" "high" "quality" "measurements" #> [253] "of" "the" "items" "subject" #> [257] "to" "error" "but" "not" #> [261] "the" "error" "prone" "measurements" #> [265] "unlike" "validation" "sam" "ples" #> [269] "the" "gold" "standard" "dataset" #> [273] "alone" "does" "not" "provide" #> [277] "enough" "information" "to" "estimate" #> [281] "the" "relationship" "between" "the" #> [285] "error" "prone" "and" "true" #> [289] "values" "it" "only" "provides" #> [293] "information" "about" "the" "distribution" #> [297] "of" "the" "true" "values" #> [301] 
"thus" "analysts" "are" "faced" #> [305] "with" "a" "special" "case" #> [309] "2" #> #> #> [[3]] #> [[3]][[1]] #> [1] "of" "data" "fusion" "rubin" #> [5] "1986" "moriarity" "and" "scheuren" #> [9] "2001" "rassler" "2002" "d’orazio" #> [13] "et" "al" "2006" "reiter" #> [17] "2012" "fosdick" "et" "al" #> [21] "2016" "i.e" "integrating" "information" #> [25] "from" "two" "databases" "with" #> [29] "disjoint" "sets" "of" "individuals" #> [33] "and" "distinct" "variables" "one" #> [37] "default" "approach" "common" "in" #> [41] "other" "data" "fusion" "contexts" #> [45] "is" "to" "assume" "that" #> [49] "the" "error" "prone" "and" #> [53] "true" "values" "are" "conditionally" #> [57] "independent" "given" "some" "set" #> [61] "of" "variables" "x" "common" #> [65] "to" "both" "the" "survey" #> [69] "and" "gold" "standard" "data" #> [73] "effectively" "this" "involves" "using" #> [77] "the" "gold" "standard" "data" #> [81] "to" "estimate" "a" "predictive" #> [85] "model" "for" "the" "true" #> [89] "values" "from" "x" "and" #> [93] "applying" "the" "estimated" "model" #> [97] "to" "impute" "replacements" "for" #> [101] "all" "values" "of" "the" #> [105] "error" "prone" "items" "in" #> [109] "the" "survey" "however" "this" #> [113] "conditional" "independence" "assumption" "completely" #> [117] "disregards" "the" "information" "in" #> [121] "the" "error" "prone" "values" #> [125] "which" "sacrifices" "potentially" "useful" #> [129] "information" "for" "example" "consider" #> [133] "national" "surveys" "that" "ask" #> [137] "people" "to" "report" "their" #> [141] "educational" "attainment" "we" "might" #> [145] "expect" "most" "people" "to" #> [149] "report" "values" "accurately" "and" #> [153] "only" "a" "modest" "fraction" #> [157] "to" "make" "errors" "it" #> [161] "does" "not" "make" "sense" #> [165] "to" "alter" "every" "individual’s" #> [169] "reported" "values" "in" "the" #> [173] "survey" "as" "would" "be" #> [177] "done" "using" "a" "conditional" #> [181] 
"independence" "approach" "in" "this" #> [185] "article" "we" "develop" "a" #> [189] "framework" "for" "leveraging" "information" #> [193] "from" "gold" "stan" "dard" #> [197] "data" "to" "improve" "inferences" #> [201] "in" "surveys" "subject" "to" #> [205] "measurement" "errors" "the" "basic" #> [209] "idea" "is" "to" "encode" #> [213] "plausible" "assumptions" "about" "the" #> [217] "error" "process" "e.g" "most" #> [221] "people" "do" "not" "make" #> [225] "errors" "when" "reporting" "educational" #> [229] "attainments" "and" "the" "reporting" #> [233] "process" "e.g" "when" "people" #> [237] "make" "errors" "they" "are" #> [241] "more" "likely" "to" "report" #> [245] "higher" "attainments" "than" "actual" #> [249] "into" "statistical" "models" "we" #> [253] "couple" "those" "models" "with" #> [257] "distributions" "for" "the" "un" #> [261] "derlying" "true" "data" "values" #> [265] "and" "use" "multiple" "imputation" #> [269] "to" "create" "plausible" "corrections" #> [273] "to" "the" "error" "prone" #> [277] "survey" "values" "which" "then" #> [281] "can" "be" "analyzed" "using" #> [285] "the" "methods" "from" "ru" #> [289] "bin" "1987" "this" "allows" #> [293] "us" "to" "avoid" "unrealistic" #> [297] "conditional" "independence" "assumptions" "in" #> [301] "lieu" "of" "more" "scientifically" #> [305] "defensible" "models" "the" "remainder" #> [309] "of" "this" "article" "is" #> [313] "organized" "as" "follows" "in" #> [317] "section" "2" "we" "review" #> [321] "an" "3" #> #> #> [[4]] #> [[4]][[1]] #> [1] "example" "of" "misreporting" "of" #> [5] "educational" "attainment" "in" "data" #> [9] "collected" "by" "the" "census" #> [13] "bureau" "so" "as" "to" #> [17] "motivate" "the" "methodological" "developments" #> [21] "in" "section" "3" "we" #> [25] "intro" "duce" "the" "general" #> [29] "framework" "for" "specifying" "measurement" #> [33] "error" "models" "to" "leverage" #> [37] "the" "information" "in" "gold" #> [41] "standard" "data" "in" "section" #> [45] 
"4" "we" "apply" "the" #> [49] "framework" "to" "handle" "po" #> [53] "tential" "measurement" "error" "in" #> [57] "educational" "attainment" "in" "the" #> [61] "2010" "american" "community" "survey" #> [65] "acs" "using" "the" "2010" #> [69] "national" "survey" "of" "college" #> [73] "graduates" "nscg" "as" "a" #> [77] "gold" "standard" "file" "in" #> [81] "doing" "so" "we" "deal" #> [85] "with" "a" "key" "complication" #> [89] "in" "the" "data" "integration" #> [93] "accounting" "for" "the" "informative" #> [97] "sampling" "design" "used" "to" #> [101] "sample" "the" "nscg" "we" #> [105] "also" "demonstrate" "how" "the" #> [109] "framework" "facilitates" "analysis" "of" #> [113] "the" "sensitivity" "of" "conclusions" #> [117] "to" "different" "measurement" "error" #> [121] "model" "specifications" "in" "section" #> [125] "5" "we" "provide" "a" #> [129] "brief" "summary" "2" "misreporting" #> [133] "in" "educational" "attainment" "to" #> [137] "illustrate" "the" "potential" "for" #> [141] "reporting" "errors" "in" "educational" #> [145] "attainment" "that" "can" "arise" #> [149] "in" "surveys" "we" "examine" #> [153] "data" "from" "the" "1993" #> [157] "nscg" "the" "1993" "nscg" #> [161] "surveyed" "individuals" "who" "indicated" #> [165] "on" "the" "1990" "census" #> [169] "long" "form" "that" "they" #> [173] "had" "at" "least" "a" #> [177] "college" "degree" "fesco" "et" #> [181] "al" "2012" "the" "questionnaire" #> [185] "asked" "about" "educational" "attainment" #> [189] "including" "detailed" "questions" "about" #> [193] "educational" "histories" "these" "questions" #> [197] "greatly" "reduce" "the" "possibility" #> [201] "of" "respondent" "error" "so" #> [205] "that" "the" "educational" "attainment" #> [209] "values" "in" "the" "nscg" #> [213] "can" "be" "considered" "a" #> [217] "gold" "standard" "black" "et" #> [221] "al" "2003" "the" "census" #> [225] "long" "form" "in" "contrast" #> [229] "did" "not" "include" "detailed" #> [233] "follow" "up" 
"questions" "so" #> [237] "that" "reported" "educational" "attainment" #> [241] "is" "prone" "to" "measurement" #> [245] "error" "the" "census" "bureau" #> [249] "linked" "each" "individual" "in" #> [253] "the" "nscg" "to" "their" #> [257] "corresponding" "record" "in" "the" #> [261] "long" "form" "data" "the" #> [265] "linked" "file" "is" "available" #> [269] "for" "download" "from" "the" #> [273] "inter" "university" "consortium" "for" #> [277] "political" "and" "social" "research" #> [281] "national" "science" "foundation" "4" #> #> #> [[5]] #> [[5]][[1]] #> [1] "table" "1" "unweighted" "cross" "tabulation" #> [6] "of" "reported" "education" "in" "the" #> [11] "nscg" "and" "census" "long" "form" #> [16] "from" "the" "linked" "dataset" "ba" #> [21] "stands" "for" "bachelor’s" "degree" "ma" #> [26] "stands" "for" "master’s" "degree" "prof" #> [31] "stands" "for" "professional" "degree" "and" #> [36] "phd" "stands" "for" "ph" "d" #> [41] "degree" "the" "14,319" "individuals" "in" #> [46] "the" "group" "labeled" "no" "degree" #> [51] "did" "not" "have" "a" "college" #> [56] "degree" "despite" "reporting" "otherwise" "the" #> [61] "51,396" "individuals" "in" "the" "group" #> [66] "labeled" "other" "did" "not" "have" #> [71] "one" "of" "ba" "ma" "prof" #> [76] "phd" "and" "are" "discarded" "from" #> [81] "subsequent" "analyses" "census" "reported" "education" #> [86] "z" "ba" "ma" "prof" "phd" #> [91] "total" "ba" "89580" "4109" "1241" #> [96] "249" "95179" "nscg" "ma" "1218" #> [101] "33928" "655" "526" "36327" "reported" #> [106] "prof" "382" "359" "8648" "563" #> [111] "9952" "education" "phd" "99" "193" #> [116] "452" "6726" "7470" "total" "91279" #> [121] "38589" "10996" "8064" "148928" "no" #> [126] "degree" "10150" "1792" "2040" "337" #> [131] "14319" "other" "33368" "10912" "4710" #> [136] "2406" "51396" "1993" "because" "of" #> [141] "the" "linkages" "we" "can" "characterize" #> [146] "the" "actual" "measurement" "error" "mechanism" #> [151] "for" 
"educational" "attainment" "in" "the" #> [156] "1990" "long" "form" "data" "in" #> [161] "the" "nscg" "we" "treat" "the" #> [166] "highest" "degree" "of" "the" "three" #> [171] "most" "recent" "degrees" "reported" "coded" #> [176] "as" "ed6c1" "ed6c2" "and" "ed6c3" #> [181] "in" "the" "file" "as" "the" #> [186] "true" "education" "level" "we" "disregard" #> [191] "any" "degrees" "earned" "in" "the" #> [196] "years" "1990" "1993" "as" "these" #> [201] "occur" "in" "the" "three" "year" #> [206] "gap" "between" "collection" "of" "the" #> [211] "long" "form" "and" "nscg" "data" #> [216] "this" "ensures" "consistent" "time" "frames" #> [221] "for" "the" "nscg" "and" "long" #> [226] "form" "reported" "values" "we" "cross" #> [231] "tabulate" "these" "degrees" "with" "the" #> [236] "degrees" "reported" "in" "the" "long" #> [241] "form" "data" "coded" "yearsch" "in" #> [246] "the" "file" "table" "1" "displays" #> [251] "the" "cross" "tabulation" "a" "similar" #> [256] "analysis" "was" "done" "by" "black" #> [261] "et" "al" "2003" "as" "evident" #> [266] "in" "table" "1" "reported" "education" #> [271] "levels" "on" "the" "long" "form" #> [276] "often" "are" "higher" "than" "those" #> [281] "on" "the" "nscg" "particularly" "for" #> [286] "individuals" "with" "only" "a" "bachelor’s" #> [291] "degree" "of" "the" "163,247" "individuals" #> [296] "in" "scope" "in" "the" "nscg" #> [301] "over" "14,000" "were" "determined" "not" #> [306] "to" "have" "at" "least" "a" #> [311] "bachelor’s" "degree" "when" "asked" "in" #> [316] "the" "nscg" "despite" "reporting" "otherwise" #> [321] "5" #> #> #> [[6]] #> [[6]][[1]] #> [1] "in" "the" "long" "form" #> [5] "a" "whopping" "33" "of" #> [9] "individuals" "who" "reported" "being" #> [13] "professionals" "in" "the" "long" #> [17] "form" "actually" "are" "not" #> [21] "professionals" "according" "to" "the" #> [25] "nscg" "one" "possible" "explanation" #> [29] "for" "this" "error" "is" #> [33] "confusion" "over" "the" "definition" #> [37] 
"of" "professionals" "the" "census" #> [41] "bureau" "intended" "the" "category" #> [45] "to" "capture" "graduate" "degrees" #> [49] "from" "universities" "e.g" "j.d" #> [53] "m.b.a" "m.d" "whereas" "black" #> [57] "et" "al" "2003" "found" #> [61] "that" "individuals" "in" "professions" #> [65] "such" "as" "cosmetology" "nursing" #> [69] "and" "health" "services" "which" #> [73] "require" "certifications" "but" "not" #> [77] "graduate" "degrees" "selected" "the" #> [81] "category" "in" "spite" "of" #> [85] "the" "nontrivial" "reporting" "error" #> [89] "the" "overwhelming" "majority" "of" #> [93] "individuals" "reported" "education" "levels" #> [97] "are" "consistent" "in" "the" #> [101] "long" "form" "and" "in" #> [105] "the" "nscg" "of" "the" #> [109] "individuals" "in" "the" "nscg" #> [113] "who" "had" "at" "least" #> [117] "a" "college" "degree" "at" #> [121] "the" "time" "of" "the" #> [125] "1990" "census" "about" "93.3" #> [129] "of" "them" "have" "the" #> [133] "same" "contemporaneous" "education" "levels" #> [137] "in" "both" "files" "this" #> [141] "suggests" "that" "most" "people" #> [145] "report" "correctly" "an" "observation" #> [149] "we" "want" "to" "leverage" #> [153] "when" "constructing" "measurement" "error" #> [157] "models" "for" "education" "in" #> [161] "the" "2010" "acs" "in" #> [165] "most" "situations" "we" "do" #> [169] "not" "have" "the" "good" #> [173] "fortune" "of" "observing" "individuals" #> [177] "error" "prone" "and" "true" #> [181] "values" "simultaneously" "instead" "we" #> [185] "are" "in" "the" "setting" #> [189] "represented" "by" "figure" "1" #> [193] "this" "is" "also" "the" #> [197] "case" "in" "our" "analysis" #> [201] "of" "educational" "attainments" "in" #> [205] "the" "2010" "acs" "described" #> [209] "in" "section" "4" "the" #> [213] "sampling" "frame" "for" "the" #> [217] "2010" "nscg" "is" "constructed" #> [221] "from" "reported" "education" "levels" #> [225] "in" "the" "acs" "which" #> [229] "replaced" "the" "long" 
"form" #> [233] "after" "the" "2000" "census" #> [237] "however" "unlike" "in" "1993" #> [241] "linked" "data" "are" "not" #> [245] "available" "as" "public" "use" #> [249] "files" "therefore" "we" "treat" #> [253] "the" "2010" "nscg" "as" #> [257] "gold" "standard" "data" "and" #> [261] "posit" "measurement" "models" "that" #> [265] "connect" "the" "information" "from" #> [269] "the" "two" "data" "sources" #> [273] "using" "the" "framework" "that" #> [277] "we" "now" "describe" "6" #> #> #> [[7]] #> [[7]][[1]] #> [1] "x" "y" "z" "de" #> [5] "x" "x" "dg" "x" #> [9] "x" "figure" "1" "graphical" #> [13] "representation" "of" "data" "fusion" #> [17] "set" "up" "in" "the" #> [21] "survey" "data" "de" "we" #> [25] "only" "observe" "the" "error" #> [29] "prone" "measurement" "z" "but" #> [33] "not" "the" "true" "value" #> [37] "y" "in" "the" "gold" #> [41] "standard" "data" "dg" "we" #> [45] "only" "observe" "y" "but" #> [49] "not" "z" "we" "observe" #> [53] "variables" "x" "in" "both" #> [57] "samples" "3" "measurement" "error" #> [61] "modeling" "via" "data" "fusion" #> [65] "as" "in" "figure" "1" #> [69] "let" "de" "and" "dg" #> [73] "be" "two" "data" "sources" #> [77] "comprising" "distinct" "individuals" "with" #> [81] "sample" "sizes" "ne" "and" #> [85] "ng" "respectively" "for" "each" #> [89] "individual" "i" "in" "dg" #> [93] "or" "de" "let" "xi" #> [97] "xi1" "xip" "be" "variables" #> [101] "common" "to" "both" "surveys" #> [105] "such" "as" "demographic" "variables" #> [109] "we" "assume" "these" "variables" #> [113] "have" "been" "harmonized" "d’orazio" #> [117] "et" "al" "2006" "across" #> [121] "dg" "and" "de" "and" #> [125] "are" "free" "of" "errors" #> [129] "let" "y" "represent" "the" #> [133] "error" "free" "values" "of" #> [137] "some" "variable" "of" "interest" #> [141] "and" "let" "z" "be" #> [145] "an" "error" "prone" "version" #> [149] "of" "y" "we" "observe" #> [153] "z" "but" "not" "y" #> [157] "for" "the" "ne" "individuals" #> [161] "in" "de" 
"we" "observe" #> [165] "y" "but" "not" "z" #> [169] "for" "the" "ng" "individuals" #> [173] "in" "dg" "for" "simplicity" #> [177] "of" "notation" "we" "assume" #> [181] "no" "missing" "values" "in" #> [185] "any" "variable" "although" "the" #> [189] "multiple" "imputation" "framework" "easily" #> [193] "handles" "missing" "values" "additionally" #> [197] "de" "can" "include" "variables" #> [201] "for" "which" "there" "is" #> [205] "no" "corresponding" "variable" "in" #> [209] "dg" "these" "variables" "do" #> [213] "not" "play" "a" "role" #> [217] "in" "the" "measurement" "error" #> [221] "modeling" "although" "they" "can" #> [225] "be" "used" "in" "multiple" #> [229] "imputation" "inferences" "we" "seek" #> [233] "to" "estimate" "pr" "y" #> [237] "z" "x" "and" "use" #> [241] "it" "to" "create" "multiple" #> [245] "imputations" "for" "the" "missing" #> [249] "values" "in" "y" "for" #> [253] "the" "individuals" "in" "de" #> [257] "we" "do" "so" "for" #> [261] "the" "common" "setting" "where" #> [265] "x" "y" "z" "are" #> [269] "all" "categorical" "variables" "similar" #> [273] "ideas" "apply" "for" "other" #> [277] "data" "types" "for" "j" #> [281] "1" "p" "let" "each" #> [285] "xj" "have" "dj" "levels" #> [289] "let" "z" "have" "dz" #> [293] "levels" "and" "y" "have" #> [297] "dy" "7" #> #> #> [[8]] #> [[8]][[1]] #> [1] "levels" "typically" "dz" "dy" #> [5] "but" "this" "need" "not" #> [9] "be" "the" "case" "generally" #> [13] "for" "example" "in" "the" #> [17] "nscg" "acs" "application" "z" #> [21] "is" "the" "educational" "attainment" #> [25] "among" "those" "who" "report" #> [29] "a" "college" "degree" "in" #> [33] "the" "acs" "which" "has" #> [37] "dz" "4" "levels" "bachelor’s" #> [41] "degree" "master’s" "degree" "professional" #> [45] "degree" "or" "ph" "d" #> [49] "degree" "and" "y" "is" #> [53] "the" "educational" "attainment" "in" #> [57] "the" "nscg" "which" "has" #> [61] "dy" "5" "levels" "an" #> [65] "additional" "level" "is" "needed" #> [69] "because" 
"some" "individuals" "in" #> [73] "the" "nscg" "truly" "do" #> [77] "not" "have" "a" "college" #> [81] "degree" "for" "all" "i" #> [85] "de" "let" "ei" "be" #> [89] "an" "unobserved" "indicator" "of" #> [93] "a" "reporting" "error" "that" #> [97] "is" "ei" "1" "when" #> [101] "yi" "6" "zi" "and" #> [105] "ei" "0" "otherwise" "using" #> [109] "e" "enables" "us" "to" #> [113] "write" "pr" "y" "z" #> [117] "x" "as" "a" "product" #> [121] "of" "three" "sub" "models" #> [125] "for" "individual" "i" "the" #> [129] "full" "data" "likelihood" "omitting" #> [133] "parameters" "for" "simplicity" "can" #> [137] "be" "factored" "as" "pr" #> [141] "yi" "k" "zi" "l" #> [145] "xi" "pr" "yi" "k" #> [149] "xi" "pr" "ei" "e" #> [153] "yi" "k" "xi" "pr" #> [157] "zi" "l" "ei" "e" #> [161] "yi" "k" "xi" "1" #> [165] "this" "separates" "the" "true" #> [169] "data" "generation" "process" "and" #> [173] "the" "measurement" "error" "generation" #> [177] "process" "which" "facilitates" "model" #> [181] "specification" "in" "particular" "we" #> [185] "can" "use" "dg" "to" #> [189] "estimate" "the" "true" "data" #> [193] "distribution" "pr" "y" "x" #> [197] "we" "then" "can" "posit" #> [201] "different" "models" "for" "the" #> [205] "rates" "of" "making" "errors" #> [209] "pr" "ei" "e" "yi" #> [213] "k" "xi" "and" "for" #> [217] "the" "reported" "values" "when" #> [221] "errors" "are" "made" "pr" #> [225] "zi" "l" "ei" "1" #> [229] "yi" "k" "xi" "intuitively" #> [233] "the" "error" "model" "locates" #> [237] "the" "records" "for" "which" #> [241] "yi" "6" "zi" "and" #> [245] "the" "reporting" "model" "captures" #> [249] "the" "patterns" "of" "misreported" #> [253] "zi" "of" "course" "when" #> [257] "ei" "0" "pr" "zi" #> [261] "yi" "1" "a" "similar" #> [265] "factorization" "is" "used" "by" #> [269] "yucel" "and" "zaslavsky" "2005" #> [273] "he" "et" "al" "2014" #> [277] "kim" "et" "al" "2015" #> [281] "and" "manrique" "vallier" "and" #> [285] "reiter" "2016" "among" "others" #> [289] "by" 
"construction" "dg" "and" #> [293] "de" "cannot" "be" "used" #> [297] "to" "estimate" "any" "of" #> [301] "the" "conditional" "probabilities" "pr" #> [305] "y" "z" "x" "directly" #> [309] "hence" "we" "have" "to" #> [313] "restrict" "the" "number" "and" #> [317] "types" "of" "parameters" "in" #> [321] "the" "sub" "models" "in" #> [325] "1" "put" "another" "way" #> [329] "if" "we" "tried" "to" #> [333] "estimate" "a" "fully" "8" #> #> #> [[9]] #> [[9]][[1]] #> [1] "saturated" "model" "for" "e" #> [5] "z" "x" "we" "would" #> [9] "not" "be" "able" "to" #> [13] "identify" "all" "the" "parameters" #> [17] "by" "using" "dg" "and" #> [21] "de" "alone" "to" "see" #> [25] "this" "assume" "for" "the" #> [29] "moment" "that" "all" "dx" #> [33] "πpj" "1" "dj" "possible" #> [37] "combinations" "of" "x" "are" #> [41] "present" "in" "dg" "and" #> [45] "de" "to" "estimate" "the" #> [49] "distribution" "of" "e" "z" #> [53] "x" "using" "a" "fully" #> [57] "saturated" "model" "we" "require" #> [61] "dy" "1" "dx" "dz" #> [65] "1" "dy" "dx" "dy" #> [69] "dz" "1" "dx" "independent" #> [73] "pieces" "of" "information" "from" #> [77] "dg" "de" "where" "each" #> [81] "subtraction" "of" "one" "derives" #> [85] "from" "the" "requirement" "that" #> [89] "probabilities" "sum" "to" "one" #> [93] "however" "dg" "and" "de" #> [97] "together" "provide" "only" "dz" #> [101] "1" "dx" "dy" "1" #> [105] "dx" "dx" "dz" "dy" #> [109] "1" "dx" "independent" "pieces" #> [113] "of" "information" "where" "we" #> [117] "add" "a" "dx" "to" #> [121] "properly" "account" "for" "the" #> [125] "sum" "to" "one" "constraint" #> [129] "a" "key" "insight" "here" #> [133] "is" "that" "since" "the" #> [137] "true" "data" "model" "requires" #> [141] "dy" "dx" "parameters" "to" #> [145] "estimate" "the" "joint" "distribution" #> [149] "for" "y" "x" "the" #> [153] "data" "can" "identify" "at" #> [157] "most" "dz" "1" "dx" #> [161] "parameters" "in" "the" "error" #> [165] "and" "reporting" "models" "combined" #> [169] 
"related" "identification" "issues" "arise" #> [173] "in" "the" "context" "of" #> [177] "refreshment" "sampling" "to" "adjust" #> [181] "for" "nonignorable" "attrition" "in" #> [185] "longitudinal" "studies" "hirano" "et" #> [189] "al" "2001" "schifeling" "et" #> [193] "al" "2015" "si" "et" #> [197] "al" "2015" "3.1" "true" #> [201] "data" "model" "pr" "yi" #> [205] "k" "xi" "one" "can" #> [209] "use" "any" "model" "for" #> [213] "y" "x" "that" "adequately" #> [217] "describes" "the" "conditional" "distri" #> [221] "bution" "such" "as" "a" #> [225] "multinomial" "logistic" "regression" "in" #> [229] "the" "nscg" "acs" "application" #> [233] "we" "use" "a" "fully" #> [237] "saturated" "multinomial" "model" "accounting" #> [241] "for" "the" "informative" "sampling" #> [245] "design" "in" "dg" "using" #> [249] "the" "approach" "described" "in" #> [253] "section" "4.1" "one" "also" #> [257] "could" "use" "a" "joint" #> [261] "distribution" "for" "y" "x" #> [265] "such" "as" "a" "log" #> [269] "linear" "model" "or" "a" #> [273] "mixture" "of" "multinomials" "model" #> [277] "dunson" "and" "xing" "2009" #> [281] "si" "and" "reiter" "2013" #> [285] "9" #> #> #> [[10]] #> [[10]][[1]] #> [1] "3.2" "error" "model" "pr" #> [5] "ei" "1" "yi" "xi" #> [9] "in" "cases" "where" "dy" #> [13] "dz" "a" "generic" "form" #> [17] "for" "the" "error" "model" #> [21] "is" "pr" "ei" "1" #> [25] "xi" "yi" "k" "g" #> [29] "xi" "yi" "β" "2" #> [33] "where" "g" "xi" "yi" #> [37] "β" "is" "some" "function" #> [41] "of" "its" "arguments" "and" #> [45] "β" "is" "some" "set" #> [49] "of" "unknown" "parameters" "a" #> [53] "convenient" "class" "of" "functions" #> [57] "that" "we" "use" "here" #> [61] "is" "the" "logistic" "regression" #> [65] "of" "ei" "on" "some" #> [69] "design" "vector" "mi" "derived" #> [73] "from" "xi" "yi" "with" #> [77] "corresponding" "coefficients" "β" "the" #> [81] "analyst" "can" "encode" "different" #> [85] "versions" "of" "mi" "to" #> [89] "represent" "assumptions" 
"about" "the" #> [93] "error" "process" "the" "simplest" #> [97] "specification" "is" "to" "set" #> [101] "each" "mi" "equal" "to" #> [105] "a" "vector" "of" "ones" #> [109] "which" "implies" "that" "there" #> [113] "is" "a" "common" "probability" #> [117] "of" "error" "for" "all" #> [121] "individuals" "this" "error" "model" #> [125] "makes" "sense" "when" "the" #> [129] "analyst" "believes" "the" "errors" #> [133] "in" "z" "occur" "completely" #> [137] "at" "random" "for" "example" #> [141] "when" "errors" "arise" "simply" #> [145] "because" "respondents" "accidentally" "and" #> [149] "randomly" "select" "the" "wrong" #> [153] "response" "in" "the" "survey" #> [157] "or" "when" "all" "respondents" #> [161] "are" "equally" "likely" "to" #> [165] "misunderstand" "the" "survey" "question" #> [169] "a" "more" "realistic" "possibility" #> [173] "is" "to" "allow" "the" #> [177] "probability" "of" "error" "to" #> [181] "depend" "on" "some" "variables" #> [185] "in" "xi" "but" "not" #> [189] "on" "yi" "e.g" "men" #> [193] "misreport" "education" "at" "different" #> [197] "rates" "than" "women" "this" #> [201] "could" "be" "encoded" "by" #> [205] "including" "an" "intercept" "for" #> [209] "one" "of" "the" "sexes" #> [213] "in" "mi" "finally" "one" #> [217] "can" "allow" "the" "probability" #> [221] "of" "error" "to" "depend" #> [225] "on" "yi" "itself" "for" #> [229] "example" "people" "who" "truly" #> [233] "do" "not" "have" "at" #> [237] "least" "a" "college" "degree" #> [241] "are" "more" "likely" "to" #> [245] "misreport" "by" "including" "some" #> [249] "function" "of" "it" "in" #> [253] "mi" "in" "the" "case" #> [257] "where" "dz" "6" "dy" #> [261] "as" "in" "the" "nscg" #> [265] "acs" "application" "we" "automatically" #> [269] "set" "ei" "1" "for" #> [273] "any" "individual" "with" "yi" #> [277] "1" "dz" "for" "example" #> [281] "we" "set" "ei" "1" #> [285] "for" "all" "individuals" "who" #> [289] "are" "determined" "in" "the" #> [293] "nscg" "not" "to" "have" #> 
[297] "a" "college" "degree" "but" #> [301] "report" "so" "in" "the" #> [305] "acs" "the" "stochastic" "part" #> [309] "of" "the" "error" "model" #> [313] "only" "applies" "to" "individuals" #> [317] "who" "truly" "have" "at" #> [321] "least" "a" "bachelor’s" "degree" #> [325] "10" #> #> #> [[11]] #> [[11]][[1]] #> [1] "3.3" "reporting" "model" "pr" #> [5] "zi" "ei" "1" "yi" #> [9] "xi" "when" "there" "is" #> [13] "no" "reporting" "error" "for" #> [17] "individual" "i" "i.e" "ei" #> [21] "0" "we" "know" "that" #> [25] "zi" "yi" "when" "there" #> [29] "is" "a" "reporting" "error" #> [33] "we" "must" "model" "the" #> [37] "reported" "value" "zi" "as" #> [41] "with" "2" "one" "can" #> [45] "posit" "a" "variety" "of" #> [49] "distributions" "for" "the" "reporting" #> [53] "error" "which" "is" "some" #> [57] "function" "h" "xi" "yi" #> [61] "α" "with" "parameters" "α" #> [65] "we" "now" "describe" "a" #> [69] "few" "reporting" "error" "models" #> [73] "for" "illustration" "one" "could" #> [77] "use" "more" "complicated" "models" #> [81] "e.g" "based" "on" "multinomial" #> [85] "logistic" "regression" "as" "well" #> [89] "a" "simple" "model" "assumes" #> [93] "that" "values" "of" "zi" #> [97] "are" "equally" "likely" "as" #> [101] "in" "manrique" "vallier" "and" #> [105] "reiter" "2016" "we" "have" #> [109] "1" "dz" "1" "if" #> [113] "l" "6" "k" "k" #> [117] "1" "dz" "pr" "zi" #> [121] "l" "xi" "yi" "k" #> [125] "ei" "1" "1" "dz" #> [129] "if" "k" "1" "dz" #> [133] "3" "0" "otherwise" "such" #> [137] "a" "reporting" "model" "could" #> [141] "be" "reasonable" "when" "reporting" #> [145] "errors" "are" "due" "to" #> [149] "clerical" "errors" "we" "note" #> [153] "that" "this" "model" "does" #> [157] "not" "accurately" "characterize" "the" #> [161] "reporting" "errors" "in" "the" #> [165] "1993" "linked" "nscg" "data" #> [169] "per" "table" "1" "alternatively" #> [173] "one" "can" "allow" "the" #> [177] "probabilities" "to" "depend" "on" #> [181] "yi" "so" "that" "zi" #> 
[185] "xi" "yi" "k" "ei" #> [189] "1" "categorical" "pk" "1" #> [193] "pk" "dz" "4" "where" #> [197] "each" "pk" "l" "is" #> [201] "the" "probability" "of" "reporting" #> [205] "z" "l" "given" "that" #> [209] "y" "k" "and" "pk" #> [213] "k" "0" "one" "can" #> [217] "further" "parameterize" "the" "reporting" #> [221] "model" "so" "that" "the" #> [225] "reporting" "probabilities" "vary" "with" #> [229] "x" "for" "example" "to" #> [233] "make" "the" "probabilities" "vary" #> [237] "with" "sex" "and" "true" #> [241] "education" "11" #> #> #> [[12]] #> [[12]][[1]] #> [1] "values" "we" "can" "use" #> [5] "categorical" "pm" "k" "1" #> [9] "pm" "k" "dz" "if" #> [13] "xi" "sex" "m" "zi" #> [17] "xi" "yi" "k" "ei" #> [21] "1" "5" "categorical" "p" #> [25] "f" "k" "1" "pf" #> [29] "k" "dz" "if" "xi" #> [33] "sex" "f" "3.4" "specifying" #> [37] "and" "estimating" "the" "model" #> [41] "as" "apparent" "in" "sections" #> [45] "3.2" "and" "3.3" "the" #> [49] "error" "and" "reporting" "models" #> [53] "can" "take" "on" "many" #> [57] "specifications" "without" "linked" "data" #> [61] "analysts" "cannot" "use" "exploratory" #> [65] "data" "analysis" "to" "inform" #> [69] "the" "model" "choice" "instead" #> [73] "we" "recommend" "that" "analysts" #> [77] "posit" "scientifically" "defensible" "measurement" #> [81] "error" "models" "and" "make" #> [85] "post" "hoc" "checks" "of" #> [89] "the" "sensibility" "of" "analyses" #> [93] "from" "those" "models" "we" #> [97] "demonstrate" "this" "approach" "in" #> [101] "section" "4" "for" "example" #> [105] "analysts" "can" "check" "whether" #> [109] "or" "not" "the" "predicted" #> [113] "probabilities" "of" "errors" "implied" #> [117] "by" "the" "model" "seem" #> [121] "plausible" "as" "another" "diagnostic" #> [125] "analysts" "can" "compare" "the" #> [129] "distribution" "of" "the" "imputed" #> [133] "values" "of" "y" "x" #> [137] "in" "de" "to" "the" #> [141] "empirical" "distribution" "of" "y" #> [145] "x" "in" "dg" "this" #> [149] "is" 
"akin" "to" "diagnostics" #> [153] "in" "multiple" "imputation" "for" #> [157] "missing" "data" "that" "compare" #> [161] "imputed" "and" "observed" "values" #> [165] "abayomi" "et" "al" "2008" #> [169] "when" "these" "distributions" "differ" #> [173] "substantially" "it" "suggests" "the" #> [177] "measurement" "error" "model" "specification" #> [181] "or" "possibly" "the" "true" #> [185] "data" "model" "is" "inadequate" #> [189] "such" "diagnostic" "checks" "only" #> [193] "can" "reveal" "problems" "with" #> [197] "the" "model" "specification" "they" #> [201] "do" "not" "indicate" "that" #> [205] "a" "particular" "specification" "is" #> [209] "correct" "more" "generally" "it" #> [213] "is" "prudent" "to" "keep" #> [217] "the" "restrictions" "on" "the" #> [221] "number" "of" "identifiable" "parameters" #> [225] "in" "mind" "when" "specifying" #> [229] "the" "models" "at" "most" #> [233] "one" "can" "identify" "the" #> [237] "equiv" "alent" "of" "dz" #> [241] "1" "dx" "parameters" "in" #> [245] "the" "combined" "model" "for" #> [249] "ei" "zi" "xi" "generally" #> [253] "for" "ease" "of" "specification" #> [257] "and" "interpretation" "we" "favor" #> [261] "rich" "error" "models" "e.g" #> [265] "with" "mi" "including" "variables" #> [269] "in" "xi" "and" "yi" #> [273] "coupled" "with" "simple" "reporting" #> [277] "models" "like" "those" "in" #> [281] "section" "3.3" "the" "exact" #> [285] "strategy" "for" "estimating" "the" #> [289] "model" "depends" "on" "the" #> [293] "features" "of" "dg" "and" #> [297] "de" "12" #> #> #> [[13]] #> [[13]][[1]] #> [1] "when" "both" "datasets" "can" #> [5] "be" "treated" "as" "simple" #> [9] "random" "samples" "we" "suggest" #> [13] "using" "a" "fully" "bayesian" #> [17] "approach" "after" "concatenating" "dg" #> [21] "and" "de" "here" "one" #> [25] "can" "use" "typical" "prior" #> [29] "distributions" "for" "the" "true" #> [33] "data" "and" "error" "models" #> [37] "for" "reporting" "models" "like" #> [41] "those" "in" "4" "and" #> 
[45] "5" "it" "is" "convenient" #> [49] "to" "use" "independent" "dirichlet" #> [53] "priors" "for" "each" "pk" #> [57] "1" "pk" "k" "1" #> [61] "pk" "k" "1" "pk" #> [65] "dz" "in" "the" "nscg" #> [69] "acs" "application" "we" "create" #> [73] "prior" "distributions" "for" "the" #> [77] "reporting" "models" "using" "the" #> [81] "information" "from" "table" "1" #> [85] "absent" "such" "information" "analysts" #> [89] "can" "use" "uniform" "prior" #> [93] "distributions" "when" "it" "does" #> [97] "not" "make" "sense" "to" #> [101] "concatenate" "dg" "and" "de" #> [105] "it" "can" "be" "convenient" #> [109] "to" "use" "a" "multi" #> [113] "stage" "estimation" "strategy" "when" #> [117] "imputing" "missing" "y" "in" #> [121] "de" "all" "of" "the" #> [125] "information" "needed" "from" "dg" #> [129] "is" "represented" "by" "the" #> [133] "parameters" "of" "the" "true" #> [137] "data" "model" "θ" "hence" #> [141] "we" "first" "can" "construct" #> [145] "a" "possibly" "approximate" "posterior" #> [149] "distribution" "of" "θ" "using" #> [153] "only" "dg" "we" "then" #> [157] "sample" "many" "draws" "from" #> [161] "this" "distribution" "we" "plug" #> [165] "these" "draws" "in" "the" #> [169] "gibbs" "sampling" "steps" "for" #> [173] "a" "bayesian" "predictive" "distribution" #> [177] "for" "yi" "zi" "xi" #> [181] "θ" "for" "the" "cases" #> [185] "in" "de" "thereby" "generating" #> [189] "the" "multiple" "imputations" "we" #> [193] "describe" "the" "gibbs" "sampler" #> [197] "for" "this" "step" "for" #> [201] "the" "nscg" "acs" "application" #> [205] "in" "the" "supplementary" "material" #> [209] "4" "adjusting" "for" "reporting" #> [213] "errors" "in" "education" "in" #> [217] "the" "2010" "acs" "we" #> [221] "now" "use" "the" "framework" #> [225] "to" "adjust" "inferences" "for" #> [229] "potential" "reporting" "error" "in" #> [233] "educa" "tional" "attainment" "in" #> [237] "the" "2010" "acs" "using" #> [241] "the" "public" "use" "microdata" #> [245] "for" "the" 
"2010" "nscg" #> [249] "as" "the" "gold" "standard" #> [253] "file" "dg" "we" "consider" #> [257] "two" "main" "analyses" "that" #> [261] "could" "be" "affected" "by" #> [265] "reporting" "error" "in" "education" #> [269] "first" "we" "estimate" "from" #> [273] "the" "acs" "the" "number" #> [277] "of" "science" "and" "engineering" #> [281] "degrees" "awarded" "to" "women" #> [285] "we" "base" "the" "estimate" #> [289] "on" "an" "indicator" "in" #> [293] "the" "acs" "for" "whether" #> [297] "or" "not" "each" "individual" #> [301] "has" "such" "a" "degree" #> [305] "second" "we" "examine" "13" #> #> #> [[14]] #> [[14]][[1]] #> [1] "average" "incomes" "across" "degrees" #> [5] "this" "focus" "is" "motivated" #> [9] "in" "part" "by" "the" #> [13] "findings" "of" "black" "et" #> [17] "al" "2006" "2008" "who" #> [21] "found" "that" "apparent" "wage" #> [25] "gaps" "in" "the" "1990" #> [29] "census" "long" "form" "data" #> [33] "could" "be" "explained" "by" #> [37] "reporting" "errors" "in" "education" #> [41] "as" "de" "we" "use" #> [45] "the" "subset" "of" "acs" #> [49] "microdata" "that" "includes" "only" #> [53] "individuals" "who" "reported" "a" #> [57] "bachelor’s" "degree" "or" "higher" #> [61] "and" "are" "under" "age" #> [65] "76" "the" "resulting" "sample" #> [69] "size" "is" "ne" "600" #> [73] "150" "in" "x" "we" #> [77] "include" "gender" "age" "group" #> [81] "24" "and" "younger" "25" #> [85] "39" "40" "54" "and" #> [89] "55" "and" "older" "and" #> [93] "an" "indicator" "for" "whether" #> [97] "the" "individual’s" "race" "is" #> [101] "black" "or" "something" "else" #> [105] "in" "the" "nscg" "we" #> [109] "discarded" "38" "records" "with" #> [113] "race" "suppressed" "leaving" "a" #> [117] "sample" "size" "of" "ng" #> [121] "77" "150" "we" "consider" #> [125] "two" "sets" "of" "measurement" #> [129] "error" "model" "specifications" "the" #> [133] "first" "set" "uses" "specifications" #> [137] "like" "those" "in" "section" #> [141] "3" "with" "flat" "prior" 
#> [145] "distributions" "for" "all" "parameters" #> [149] "we" "use" "this" "set" #> [153] "to" "illustrate" "model" "diagnostics" #> [157] "and" "sensitivity" "analysis" "absent" #> [161] "prior" "information" "about" "the" #> [165] "measurement" "error" "process" "the" #> [169] "second" "set" "uses" "a" #> [173] "common" "error" "and" "reporting" #> [177] "model" "with" "different" "informative" #> [181] "prior" "distributions" "on" "its" #> [185] "parameters" "we" "construct" "these" #> [189] "informative" "prior" "distributions" "based" #> [193] "on" "the" "analysis" "of" #> [197] "the" "1993" "linked" "file" #> [201] "for" "all" "specifications" "considered" #> [205] "we" "create" "m" "50" #> [209] "multiple" "imputations" "of" "the" #> [213] "plausible" "true" "education" "values" #> [217] "in" "the" "2010" "acs" #> [221] "which" "we" "then" "analyze" #> [225] "using" "the" "methods" "of" #> [229] "rubin" "1987" "for" "all" #> [233] "specifications" "the" "true" "data" #> [237] "model" "is" "a" "saturated" #> [241] "multinomial" "distribution" "for" "the" #> [245] "five" "values" "of" "y" #> [249] "for" "each" "combination" "of" #> [253] "x" "we" "begin" "by" #> [257] "describing" "how" "we" "estimate" #> [261] "the" "parameters" "of" "the" #> [265] "true" "data" "distribution" "accounting" #> [269] "for" "the" "informative" "sampling" #> [273] "design" "of" "the" "nscg" #> [277] "4.1" "accounting" "for" "informative" #> [281] "sampling" "design" "of" "nscg" #> [285] "the" "2010" "nscg" "uses" #> [289] "reported" "education" "in" "the" #> [293] "2010" "acs" "as" "a" #> [297] "stratification" "variable" "fesco" "et" #> [301] "al" "2012" "finamore" "2013" #> [305] "its" "unweighted" "percentages" "can" #> [309] "over" "represent" "14" #> #> #> [[15]] #> [[15]][[1]] #> [1] "or" "under" "represent" "degree" "types" #> [6] "in" "the" "population" "this" "is" #> [11] "most" "obviously" "the" "case" "for" #> [16] "individuals" "without" "a" "college" "degree" #> 
[21] "yi" "5" "we" "need" "to" #> [26] "account" "for" "this" "informative" "sampling" #> [31] "when" "estimating" "parameters" "of" "the" #> [36] "true" "data" "model" "we" "do" #> [41] "so" "with" "a" "two" "stage" #> [46] "approach" "first" "we" "use" "survey" #> [51] "weighted" "inferences" "to" "estimate" "population" #> [56] "totals" "of" "y" "x" "from" #> [61] "the" "2010" "nscg" "second" "we" #> [66] "turn" "these" "estimates" "into" "an" #> [71] "approximate" "bayesian" "posterior" "distribution" "for" #> [76] "input" "to" "fitting" "the" "measurement" #> [81] "error" "models" "used" "to" "impute" #> [86] "plausible" "values" "of" "yi" "for" #> [91] "individuals" "in" "the" "acs" "we" #> [96] "now" "describe" "this" "process" "which" #> [101] "can" "be" "used" "generally" "when" #> [106] "dg" "is" "collected" "via" "a" #> [111] "complex" "survey" "design" "suppose" "for" #> [116] "the" "moment" "that" "dy" "dz" #> [121] "this" "is" "not" "the" "case" #> [126] "when" "de" "is" "the" "acs" #> [131] "where" "dz" "4" "and" "dg" #> [136] "is" "the" "nscg" "where" "dy" #> [141] "5" "however" "we" "start" "here" #> [146] "to" "fix" "ideas" "for" "all" #> [151] "possible" "combinations" "x" "let" "θxk" #> [156] "pr" "y" "k" "x" "x" #> [161] "and" "let" "θx" "θx1" "θxdy" #> [166] "we" "seek" "to" "use" "dg" #> [171] "to" "specify" "f" "θ" "x" #> [176] "y" "to" "do" "so" "we" #> [181] "first" "parameterize" "θxk" "txk" "dj" #> [186] "1" "py" "txj" "where" "txk" #> [191] "is" "the" "population" "count" "of" #> [196] "individuals" "with" "xi" "x" "yi" #> [201] "k" "we" "estimate" "tx" "tx1" #> [206] "txdy" "and" "the" "associated" "covariance" #> [211] "matrix" "of" "the" "estimator" "using" #> [216] "standard" "survey" "weighted" "estimation" "let" #> [221] "wi" "be" "the" "sample" "weight" #> [226] "for" "all" "i" "dg" "we" #> [231] "compute" "the" "estimated" "total" "and" #> [236] "associated" "variance" "for" "each" "x" #> [241] "and" "k" "as" "ng" "x" #> [246] 
"t̂xk" "wi" "i" "xi" "x" #> [251] "yi" "k" "6" "i" "1" #> [256] "n" "2" "g" "d" "t̂xk" #> [261] "ng" "x" "t̂xk" "var" "wi" #> [266] "i" "xi" "x" "yi" "k" #> [271] "7" "ng" "1" "i" "1" #> [276] "ng" "15" #> #> #> [[16]] #> [[16]][[1]] #> [1] "for" "each" "k" "and" "l" #> [6] "with" "l" "6" "k" "we" #> [11] "also" "compute" "the" "estimated" "covariance" #> [16] "n" "g" "d" "t̂xk" "t̂xl" #> [21] "ng" "t̂xk" "x" "cov" "wi" #> [26] "i" "xi" "x" "yi" "k" #> [31] "ng" "1" "i" "1" "ng" #> [36] "t̂xl" "wi" "i" "xi" "x" #> [41] "yi" "l" "8" "ng" "the" #> [46] "variance" "and" "covariance" "estimators" "are" #> [51] "the" "design" "based" "estimators" "for" #> [56] "probability" "proportional" "to" "size" "sampling" #> [61] "with" "replacement" "as" "is" "typical" #> [66] "of" "multi" "stage" "complex" "surveys" #> [71] "lohr" "2010" "switching" "now" "to" #> [76] "a" "bayesian" "modeling" "perspective" "we" #> [81] "assume" "that" "tx" "log" "normal" #> [86] "µx" "τx" "so" "as" "to" #> [91] "ensure" "a" "distribution" "with" "positive" #> [96] "values" "for" "all" "true" "totals" #> [101] "we" "select" "µx" "τx" "so" #> [106] "that" "each" "e" "txk" "t̂xk" #> [111] "and" "var" "tx" "σ̂" "t̂x" #> [116] "the" "estimated" "covariance" "matrix" "with" #> [121] "elements" "defined" "by" "7" "and" #> [126] "8" "these" "are" "derived" "from" #> [131] "moment" "matching" "tarmast" "2001" "we" #> [136] "have" "µxj" "log" "t̂xj" "τx" #> [141] "j" "j" "2" "9" "2" #> [146] "τx" "j" "j" "log" "1" #> [151] "σ̂x" "j" "j" "t̂xj" "10" #> [156] "τx" "j" "i" "log" "1" #> [161] "σ̂x" "j" "i" "t̂xj" "t̂xi" #> [166] "11" "where" "the" "notation" "j" #> [171] "i" "denotes" "an" "element" "in" #> [176] "row" "j" "and" "column" "i" #> [181] "of" "the" "matrix" "we" "draw" #> [186] "tx" "from" "this" "log" "normal" #> [191] "distribution" "and" "transform" "to" "draws" #> [196] "θx" "since" "the" "2010" "nscg" #> [201] "does" "not" "include" "individuals" "who" #> [206] "claim" "in" "the" "acs" "to" 
#> [211] "have" "less" "than" "a" "bachelor’s" #> [216] "degree" "we" "cannot" "use" "dg" #> [221] "directly" "to" "estimate" "tx5" "instead" #> [226] "we" "estimate" "tx" "tx1" "tx2" #> [231] "tx3" "tx4" "tx5" "using" "the" #> [236] "acs" "data" "and" "estimate" "tx1" #> [241] "tx2" "tx3" "tx4" "from" "the" #> [246] "nscg" "using" "the" "method" "described" #> [251] "previously" "this" "leads" "to" "an" #> [256] "estimate" "for" "tx5" "more" "precisely" #> [261] "let" "the" "acs" "design" "based" #> [266] "estimator" "for" "tx" "16" #> #> #> [[17]] #> [[17]][[1]] #> [1] "table" "2" "summary" "of" #> [5] "the" "first" "four" "measurement" #> [9] "error" "model" "specifications" "for" #> [13] "2010" "nscg" "acs" "analysis" #> [17] "these" "models" "use" "flat" #> [21] "prior" "distributions" "on" "all" #> [25] "parameters" "error" "model" "reporting" #> [29] "model" "expression" "for" "mit" #> [33] "β" "p" "r" "zi" #> [37] "yi" "k" "ei" "1" #> [41] "p4" "model" "1" "β1" #> [45] "k" "2" "βk" "i" #> [49] "yi" "k" "categorical" "pk" #> [53] "1" "pk" "4" "p4" #> [57] "m" "model" "2" "β1" #> [61] "k" "2" "βk" "i" #> [65] "yi" "k" "xi" "sex" #> [69] "m" "categorical" "pk" "1" #> [73] "pk" "4" "p4" "no" #> [77] "model" "3" "β1" "k" #> [81] "2" "βk" "i" "yi" #> [85] "k" "xi" "black" "no" #> [89] "categorical" "pk" "1" "pk" #> [93] "4" "p4" "yes" "k" #> [97] "1" "βk" "i" "yi" #> [101] "k" "xi" "black" "yes" #> [105] "p4" "m" "model" "4" #> [109] "β1" "k" "2" "βk" #> [113] "i" "yi" "k" "xi" #> [117] "sex" "m" "categorical" "pm" #> [121] "k" "1" "pm" "k" #> [125] "4" "if" "xi" "sex" #> [129] "m" "p4" "f" "k" #> [133] "1" "βk" "i" "yi" #> [137] "k" "xi" "sex" "f" #> [141] "categorical" "pf" "k" "1" #> [145] "pf" "k" "4" "if" #> [149] "xi" "sex" "f" "be" #> [153] "t̂x" "with" "design" "based" #> [157] "variance" "estimate" "σ̂" "2" #> [161] "t̂x" "we" "sample" "a" #> [165] "value" "tx" "normal" "t̂x" #> [169] "σ̂" "2" "t̂x" "using" #> [173] "an" "independent" "sample" "of" #> 
[177] "values" "of" "tx1" "tx4" #> [181] "from" "4j" "1" "txj" #> [185] "and" "set" "tx" "tx1" #> [189] "p" "the" "nscg" "we" #> [193] "compute" "tx5" "tx" "tx5" #> [197] "we" "repeat" "these" "steps" #> [201] "10,000" "times" "we" "then" #> [205] "compute" "the" "mean" "and" #> [209] "covariance" "matrix" "of" "the" #> [213] "10,000" "draws" "which" "we" #> [217] "again" "plug" "into" "9" #> [221] "11" "the" "resulting" "log" #> [225] "normal" "distri" "bution" "is" #> [229] "the" "approximate" "posterior" "distribution" #> [233] "of" "θx" "we" "include" #> [237] "an" "example" "of" "this" #> [241] "entire" "procedure" "in" "the" #> [245] "supplementary" "material" "4.2" "measurement" #> [249] "error" "models" "the" "two" #> [253] "sets" "of" "measurement" "error" #> [257] "models" "include" "four" "that" #> [261] "use" "flat" "prior" "distributions" #> [265] "and" "three" "that" "use" #> [269] "informative" "prior" "distributions" "based" #> [273] "on" "the" "1993" "linked" #> [277] "data" "for" "all" "error" #> [281] "models" "we" "use" "a" #> [285] "logistic" "regression" "of" "ei" #> [289] "on" "various" "main" "effects" #> [293] "and" "interactions" "of" "yi" #> [297] "and" "xi" "for" "all" #> [301] "reporting" "models" "we" "use" #> [305] "categorical" "distributions" "with" "probabilities" #> [309] "that" "depend" "on" "yi" #> [313] "and" "possibly" "xi" "the" #> [317] "four" "models" "with" "flat" #> [321] "prior" "distributions" "are" "summarized" #> [325] "in" "table" "2" "in" #> [329] "model" "1" "the" "error" #> [333] "and" "reporting" "models" "depend" #> [337] "only" "on" "17" #> #> #> [[18]] #> [[18]][[1]] #> [1] "table" "3" "summary" "of" #> [5] "informative" "prior" "specifications" "for" #> [9] "2010" "nscg" "acs" "analysis" #> [13] "for" "males" "with" "bachelor’s" #> [17] "degrees" "error" "rate" "reporting" #> [21] "probabilities" "pm" "1" "2" #> [25] "pm" "1" "3" "pm" #> [29] "1" "4" "model" "4" #> [33] "beta" "1" "1" "dirichlet" #> [37] "1" 
"1" "1" "model" #> [41] "5" "beta" "76" "14.24" #> [45] "dirichlet" "3.54" "1.27" "0.19" #> [49] "model" "6" "beta" "2724.2" #> [53] "50862" "dirichlet" "2235.3" "799.7" #> [57] "123.1" "model" "7" "beta" #> [61] "500" "99500" "dirichlet" "1" #> [65] "1" "1" "yi" "model" #> [69] "2" "and" "3" "keep" #> [73] "the" "reporting" "model" "as" #> [77] "in" "4" "but" "expand" #> [81] "the" "error" "model" "in" #> [85] "model" "2" "the" "probability" #> [89] "of" "a" "reporting" "error" #> [93] "can" "vary" "with" "yi" #> [97] "and" "sex" "xi" "sex" #> [101] "in" "model" "3" "error" #> [105] "probabilities" "can" "vary" "with" #> [109] "yi" "and" "the" "indicator" #> [113] "for" "black" "race" "xi" #> [117] "black" "in" "model" "4" #> [121] "the" "error" "and" "reporting" #> [125] "models" "both" "depend" "on" #> [129] "y" "and" "sex" "for" #> [133] "models" "5" "7" "we" #> [137] "use" "the" "specification" "in" #> [141] "model" "4" "and" "incorporate" #> [145] "prior" "in" "formation" "about" #> [149] "the" "measurement" "errors" "from" #> [153] "the" "1993" "linked" "data" #> [157] "in" "constructing" "the" "priors" #> [161] "we" "first" "remove" "records" #> [165] "that" "have" "been" "flagged" #> [169] "as" "having" "missing" "education" #> [173] "that" "has" "been" "imputed" #> [177] "because" "these" "imputations" "might" #> [181] "not" "closely" "reflect" "the" #> [185] "actual" "education" "values" "black" #> [189] "et" "al" "2003" "table" #> [193] "3" "displays" "the" "prior" #> [197] "distributions" "for" "males" "with" #> [201] "bachelor’s" "degrees" "details" "on" #> [205] "how" "we" "arrive" "at" #> [209] "these" "and" "other" "groups" #> [213] "prior" "specifications" "are" "in" #> [217] "the" "supplementary" "material" "here" #> [221] "we" "summarize" "briefly" "x" #> [225] "for" "model" "5" "we" #> [229] "set" "the" "prior" "distributions" #> [233] "for" "each" "βk" "so" #> [237] "that" "the" "error" "rates" #> [241] "are" "centered" "at" "the" #> [245] 
"estimate" "from" "the" "1993" #> [249] "linked" "data" "we" "also" #> [253] "require" "the" "central" "95" #> [257] "probability" "interval" "of" "the" #> [261] "prior" "distribution" "on" "each" #> [265] "error" "rate" "to" "be" #> [269] "close" "to" "005" "20" #> [273] "allowing" "for" "a" "wide" #> [277] "but" "not" "unrealistic" "range" #> [281] "of" "possible" "error" "rates" #> [285] "for" "the" "reporting" "probabilities" #> [289] "pm" "k" "z" "and" #> [293] "pf" "k" "z" "we" #> [297] "center" "most" "of" "the" #> [301] "prior" "distributions" "at" "the" #> [305] "corresponding" "estimates" "from" "the" #> [309] "1993" "linked" "data" "we" #> [313] "require" "the" "central" "95" #> [317] "probability" "interval" "of" "each" #> [321] "prior" "distribution" "to" "have" #> [325] "support" "on" "values" "of" #> [329] "p" "k" "z" "within" #> [333] "10" "of" "the" "1993" #> [337] "point" "estimate" "truncating" "at" #> [341] "zero" "or" "one" "as" #> [345] "needed" "one" "exception" "is" #> [349] "18" #> #> #> [[19]] #> [[19]][[1]] #> [1] "the" "reporting" "probabilities" "for" #> [5] "those" "with" "no" "college" #> [9] "degree" "who" "report" "professional" #> [13] "degree" "which" "we" "center" #> [17] "at" "half" "the" "1993" #> [21] "estimate" "the" "census" "bureau" #> [25] "has" "improved" "the" "clarity" #> [29] "of" "the" "definition" "of" #> [33] "professional" "in" "the" "20" #> [37] "years" "since" "the" "1990" #> [41] "long" "form" "as" "discussed" #> [45] "in" "the" "prior" "specification" #> [49] "section" "of" "the" "supplementary" #> [53] "material" "for" "model" "6" #> [57] "we" "use" "the" "same" #> [61] "prior" "means" "as" "in" #> [65] "model" "5" "for" "both" #> [69] "error" "and" "re" "porting" #> [73] "models" "however" "we" "substantially" #> [77] "tighten" "the" "prior" "distributions" #> [81] "to" "make" "the" "prior" #> [85] "variance" "accord" "with" "the" #> [89] "uncertainty" "in" "the" "point" #> [93] "estimates" "from" "the" 
"1993" #> [97] "linked" "data" "we" "do" #> [101] "so" "by" "using" "prior" #> [105] "sample" "sizes" "that" "match" #> [109] "those" "from" "the" "1993" #> [113] "nscg" "for" "example" "the" #> [117] "1993" "nscg" "included" "53,586" #> [121] "males" "with" "bachelor’s" "degrees" #> [125] "excluding" "those" "records" "who" #> [129] "had" "their" "census" "education" #> [133] "imputed" "we" "therefore" "use" #> [137] "beta" "2724.2" "50862" "as" #> [141] "the" "prior" "distribution" "for" #> [145] "the" "error" "rate" "for" #> [149] "this" "x" "we" "similarly" #> [153] "increase" "the" "prior" "sample" #> [157] "sizes" "for" "the" "reporting" #> [161] "probabilities" "to" "match" "the" #> [165] "1993" "nscg" "sample" "sizes" #> [169] "model" "7" "departs" "from" #> [173] "the" "1993" "linked" "data" #> [177] "estimates" "and" "encodes" "a" #> [181] "strong" "prior" "belief" "that" #> [185] "almost" "no" "one" "misreports" #> [189] "their" "education" "except" "for" #> [193] "haphazard" "mistakes" "here" "we" #> [197] "set" "the" "prior" "mean" #> [201] "for" "the" "probability" "of" #> [205] "misreporting" "education" "to" "005" #> [209] "for" "all" "demographic" "groups" #> [213] "we" "use" "a" "prior" #> [217] "sample" "size" "of" "100,000" #> [221] "making" "the" "prior" "distribution" #> [225] "concentrate" "strongly" "around" "005" #> [229] "for" "the" "reporting" "probabilities" #> [233] "we" "use" "a" "non" #> [237] "informative" "prior" "distribution" "for" #> [241] "convenience" "since" "the" "estimates" #> [245] "of" "the" "reporting" "probabilities" #> [249] "are" "strongly" "influenced" "by" #> [253] "the" "concentrated" "prior" "distributions" #> [257] "on" "the" "error" "rates" #> [261] "finally" "for" "comparison" "purposes" #> [265] "we" "also" "fit" "the" #> [269] "model" "based" "on" "a" #> [273] "conditional" "independence" "assumption" "cia" #> [277] "to" "impute" "yi" "for" #> [281] "individuals" "in" "the" "acs" #> [285] "under" "the" "cia" 
"we" #> [289] "sample" "θ" "and" "then" #> [293] "impute" "y" "θ" "x" #> [297] "from" "the" "true" "data" #> [301] "model" "here" "we" "do" #> [305] "not" "use" "the" "reported" #> [309] "value" "of" "zi" "in" #> [313] "the" "imputations" "19" #> #> #> [[20]] #> [[20]][[1]] #> [1] "4.3" "empirical" "results" "we" #> [5] "first" "examine" "what" "each" #> [9] "model" "suggests" "about" "the" #> [13] "extent" "and" "nature" "of" #> [17] "the" "mea" "surement" "errors" #> [21] "in" "the" "2010" "acs" #> [25] "we" "then" "use" "the" #> [29] "models" "to" "assess" "sensitivity" #> [33] "of" "results" "about" "the" #> [37] "substantive" "questions" "related" "to" #> [41] "number" "of" "degrees" "and" #> [45] "income" "4.3.1" "distributions" "of" #> [49] "errors" "in" "reported" "acs" #> [53] "education" "values" "table" "4" #> [57] "displays" "the" "multiple" "imputation" #> [61] "point" "estimates" "and" "95" #> [65] "confidence" "intervals" "for" "the" #> [69] "proportions" "of" "errors" "by" #> [73] "gender" "and" "nscg" "education" #> [77] "obtained" "from" "the" "m" #> [81] "50" "draws" "of" "ei" #> [85] "for" "all" "individuals" "in" #> [89] "de" "we" "begin" "by" #> [93] "comparing" "results" "for" "the" #> [97] "set" "of" "models" "with" #> [101] "flat" "prior" "distributions" "models" #> [105] "1" "4" "and" "the" #> [109] "cia" "model" "then" "move" #> [113] "to" "the" "set" "of" #> [117] "models" "with" "informative" "prior" #> [121] "distributions" "models" "5" "7" #> [125] "the" "cia" "model" "suggests" #> [129] "extremely" "large" "error" "percentages" #> [133] "especially" "for" "the" "highest" #> [137] "education" "levels" "these" "rates" #> [141] "seem" "unlikely" "to" "be" #> [145] "reality" "leading" "us" "to" #> [149] "reject" "the" "cia" "model" #> [153] "the" "overall" "error" "rates" #> [157] "for" "models" "1" "4" #> [161] "are" "similar" "and" "more" #> [165] "realistic" "than" "those" "from" #> [169] "the" "cia" "model" "the" #> [173] 
"differences" "in" "error" "estimates" #> [177] "between" "model" "2" "and" #> [181] "model" "1" "suggest" "that" #> [185] "the" "probability" "of" "error" #> [189] "depends" "on" "sex" "comparing" #> [193] "results" "for" "model" "3" #> [197] "and" "model" "1" "however" #> [201] "we" "see" "little" "evidence" #> [205] "of" "important" "race" "effects" #> [209] "on" "the" "propensity" "to" #> [213] "make" "errors" "model" "4" #> [217] "generalizes" "model" "2" "by" #> [221] "allowing" "the" "reporting" "probabilities" #> [225] "to" "vary" "by" "sex" #> [229] "if" "these" "probabilities" "were" #> [233] "similar" "across" "sex" "in" #> [237] "reality" "we" "would" "expect" #> [241] "the" "two" "models" "to" #> [245] "produce" "similar" "results" "however" #> [249] "the" "estimated" "error" "rates" #> [253] "are" "fairly" "different" "for" #> [257] "example" "the" "estimated" "proportion" #> [261] "of" "errors" "for" "female" #> [265] "professionals" "from" "model" "4" #> [269] "is" "about" "double" "that" #> [273] "from" "model" "2" "to" #> [277] "determine" "where" "the" "models" #> [281] "differ" "most" "we" "examine" #> [285] "the" "estimated" "reporting" "probabilities" #> [289] "displayed" "in" "table" "5" #> [293] "model" "4" "estimates" "some" #> [297] "significant" "differences" "in" "reporting" #> [301] "probabilities" "by" "gender" "for" #> [305] "example" "20" #> #> #> [[21]] #> [[21]][[1]] #> [1] "males" "with" "bachelor’s" "degrees" #> [5] "who" "make" "a" "reporting" #> [9] "error" "are" "estimated" "to" #> [13] "report" "a" "master’s" "degree" #> [17] "with" "probability" "96" "whereas" #> [21] "females" "with" "bachelor’s" "degrees" #> [25] "who" "make" "a" "reporting" #> [29] "error" "are" "estimated" "to" #> [33] "report" "a" "master’s" "degree" #> [37] "with" "probability" "67" "and" #> [41] "a" "professional" "degree" "with" #> [45] "probability" "30" "other" "large" #> [49] "differences" "exist" "for" "professional" #> [53] "degree" "holders" 
"females" "with" #> [57] "professional" "degrees" "who" "make" #> [61] "a" "reporting" "error" "are" #> [65] "most" "likely" "to" "report" #> [69] "a" "bachelor’s" "degree" "whereas" #> [73] "men" "with" "professional" "degrees" #> [77] "who" "make" "a" "reporting" #> [81] "error" "are" "most" "likely" #> [85] "to" "report" "a" "master’s" #> [89] "degree" "or" "ph" "d" #> [93] "we" "note" "that" "some" #> [97] "of" "the" "estimates" "for" #> [101] "model" "4" "are" "based" #> [105] "on" "small" "sample" "sizes" #> [109] "which" "explains" "the" "wide" #> [113] "standard" "errors" "turning" "to" #> [117] "models" "5" "7" "we" #> [121] "can" "see" "the" "impact" #> [125] "of" "the" "informative" "prior" #> [129] "distributions" "by" "comparing" "results" #> [133] "in" "table" "4" "under" #> [137] "these" "models" "to" "those" #> [141] "for" "model" "4" "moving" #> [145] "from" "model" "4" "to" #> [149] "model" "5" "the" "most" #> [153] "noticeable" "differences" "are" "for" #> [157] "women" "with" "a" "ph" #> [161] "d" "and" "men" "with" #> [165] "a" "master’s" "degree" "for" #> [169] "whom" "model" "5" "suggests" #> [173] "lower" "error" "rates" "these" #> [177] "groups" "have" "smaller" "sample" #> [181] "sizes" "so" "that" "the" #> [185] "data" "do" "not" "swamp" #> [189] "the" "effects" "of" "the" #> [193] "prior" "distribution" "when" "making" #> [197] "the" "prior" "sample" "sizes" #> [201] "very" "large" "as" "in" #> [205] "models" "6" "and" "7" #> [209] "the" "information" "in" "the" #> [213] "prior" "distribution" "tends" "to" #> [217] "overwhelm" "the" "information" "in" #> [221] "the" "data" "we" "provide" #> [225] "more" "thorough" "investigation" "of" #> [229] "the" "impact" "of" "the" #> [233] "prior" "specifications" "in" "the" #> [237] "supplementary" "material" "of" "course" #> [241] "we" "cannot" "be" "certain" #> [245] "which" "model" "most" "closely" #> [249] "reflects" "the" "true" "measure" #> [253] "ment" "error" "mechanism" "the" #> [257] 
"best" "we" "can" "do" #> [261] "is" "perform" "diagnostic" "tests" #> [265] "to" "see" "which" "models" #> [269] "if" "any" "should" "be" #> [273] "discounted" "as" "not" "adequately" #> [277] "describing" "the" "observed" "data" #> [281] "m" "for" "each" "acs" #> [285] "imputed" "dataset" "de" "under" #> [289] "each" "model" "we" "compute" #> [293] "the" "sample" "pro" "m" #> [297] "portions" "π̂xk" "and" "corresponding" #> [301] "multiple" "imputation" "95" "confidence" #> [305] "intervals" "for" "all" "165̇" #> [309] "unique" "values" "of" "x" #> [313] "y" "we" "determine" "how" #> [317] "many" "of" "the" "80" #> [321] "estimated" "population" "percentages" "of" #> [325] "y" "x" "computed" "from" #> [329] "the" "2010" "nscg" "using" #> [333] "the" "estimated" "t̂x" "from" #> [337] "the" "acs" "to" "back" #> [341] "into" "an" "estimate" "of" #> [345] "t̂x5" "fall" "within" "the" #> [349] "multiple" "imputation" "95" "21" #> #> #> [[22]] #> [[22]][[1]] #> [1] "confidence" "intervals" "models" "that" #> [5] "yield" "low" "rates" "do" #> [9] "not" "describe" "the" "data" #> [13] "accurately" "for" "model" "1" #> [17] "73" "of" "80" "nscg" #> [21] "population" "share" "estimates" "are" #> [25] "contained" "in" "the" "acs" #> [29] "multiple" "imputation" "intervals" "corresponding" #> [33] "counts" "are" "75" "for" #> [37] "model" "2" "71" "for" #> [41] "model" "3" "and" "76" #> [45] "for" "model" "4" "these" #> [49] "results" "suggest" "that" "model" #> [53] "1" "and" "model" "3" #> [57] "may" "be" "inferior" "to" #> [61] "model" "2" "and" "model" #> [65] "4" "for" "the" "models" #> [69] "with" "informative" "prior" "distributions" #> [73] "the" "counts" "are" "74" #> [77] "for" "model" "5" "67" #> [81] "for" "model" "6" "and" #> [85] "54" "for" "model" "7" #> [89] "although" "the" "prior" "beliefs" #> [93] "in" "models" "6" "and" #> [97] "7" "seem" "plausible" "at" #> [101] "first" "glance" "the" "diagnostic" #> [105] "suggests" "that" "they" "do" #> [109] "not" 
"describe" "the" "2010" #> [113] "data" "distributions" "as" "well" #> [117] "as" "models" "4" "and" #> [121] "5" "considering" "the" "results" #> [125] "as" "well" "as" "the" #> [129] "diagnostic" "check" "if" "we" #> [133] "had" "to" "choose" "one" #> [137] "model" "we" "would" "select" #> [141] "model" "5" "it" "seems" #> [145] "plausible" "that" "the" "probability" #> [149] "of" "misreporting" "education" "as" #> [153] "well" "as" "the" "reported" #> [157] "value" "itself" "when" "errors" #> [161] "are" "made" "depend" "on" #> [165] "both" "sex" "and" "true" #> [169] "education" "level" "additionally" "the" #> [173] "prior" "distribution" "from" "the" #> [177] "1993" "linked" "data" "pulls" #> [181] "estimates" "in" "groups" "with" #> [185] "little" "sample" "size" "to" #> [189] "measurement" "error" "distributions" "that" #> [193] "seem" "more" "plausible" "on" #> [197] "face" "value" "however" "one" #> [201] "need" "not" "use" "the" #> [205] "data" "fusion" "framework" "for" #> [209] "measurement" "error" "to" "select" #> [213] "a" "single" "model" "rather" #> [217] "one" "can" "use" "the" #> [221] "framework" "to" "examine" "sensitivity" #> [225] "of" "analyses" "to" "the" #> [229] "different" "specifications" "4.3.2" "sensitivity" #> [233] "analyses" "figure" "2" "displays" #> [237] "the" "multiply" "imputed" "survey" #> [241] "weighted" "inferences" "for" "the" #> [245] "total" "number" "of" "women" #> [249] "with" "science" "and" "engineering" #> [253] "degrees" "computing" "using" "the" #> [257] "acs" "specific" "indicator" "variable" #> [261] "we" "show" "results" "for" #> [265] "models" "4" "7" "the" #> [269] "cia" "model" "and" "based" #> [273] "on" "the" "acs" "data" #> [277] "without" "any" "adjustment" "for" #> [281] "misreporting" "education" "the" "confidence" #> [285] "intervals" "for" "model" "4" #> [289] "and" "model" "5" "overlap" #> [293] "substantially" "suggesting" "not" "much" #> [297] "practical" "difference" "in" "choosing" #> [301] 
"among" "these" "models" "however" #> [305] "both" "are" "noticeably" "different" #> [309] "from" "the" "other" "models" #> [313] "especially" "for" "the" "ph" #> [317] "d" "and" "professional" "degrees" #> [321] "as" "the" "prior" "distributions" #> [325] "on" "the" "error" "rates" #> [329] "get" "stronger" "the" "estimated" #> [333] "counts" "increase" "towards" "22" #> #> #> [[23]] #> [[23]][[1]] #> [1] "6" "x" "10" "bachelors" #> [5] "degree" "6" "x" "10" #> [9] "masters" "degree" "5.2" "2.6" #> [13] "acs" "cia" "model" "model" #> [17] "4" "5" "model" "5" #> [21] "2.5" "estimated" "total" "no" #> [25] "of" "sci" "and" "eng" #> [29] "degrees" "estimated" "total" "no" #> [33] "of" "sci" "and" "eng" #> [37] "degrees" "model" "6" "model" #> [41] "7" "4.8" "2.4" "4.6" #> [45] "2.3" "awarded" "to" "women" #> [49] "awarded" "to" "women" "4.4" #> [53] "2.2" "4.2" "2.1" "4" #> [57] "2" "acs" "cia" "m4" #> [61] "m5" "m6" "m7" "acs" #> [65] "cia" "m4" "m5" "m6" #> [69] "m7" "model" "model" "5" #> [73] "x" "10" "professional" "degree" #> [77] "x" "10" "5" "phd" #> [81] "degree" "7.5" "5" "7" #> [85] "4.5" "estimated" "total" "no" #> [89] "of" "sci" "and" "eng" #> [93] "degrees" "estimated" "total" "no" #> [97] "of" "sci" "and" "eng" #> [101] "degrees" "6.5" "4" "6" #> [105] "5.5" "3.5" "awarded" "to" #> [109] "women" "awarded" "to" "women" #> [113] "5" "3" "4.5" "2.5" #> [117] "4" "2" "3.5" "3" #> [121] "1.5" "acs" "cia" "m4" #> [125] "m5" "m6" "m7" "acs" #> [129] "cia" "m4" "m5" "m6" #> [133] "m7" "model" "model" "figure" #> [137] "2" "the" "estimated" "total" #> [141] "number" "of" "science" "and" #> [145] "engineering" "degrees" "awarded" "to" #> [149] "women" "under" "each" "model" #> [153] "we" "plot" "the" "mean" #> [157] "and" "95" "confidence" "intervals" #> [161] "note" "the" "difference" "in" #> [165] "scale" "for" "each" "degree" #> [169] "category" "the" "estimate" "using" #> [173] "the" "acs" "reported" "education" #> [177] "we" "note" "that" "using" #> [181] 
"the" "acs" "reported" "education" #> [185] "without" "adjustments" "results" "in" #> [189] "substantially" "higher" "estimated" "totals" #> [193] "at" "the" "professional" "and" #> [197] "ph" "d" "levels" "than" #> [201] "any" "of" "the" "models" #> [205] "that" "account" "for" "measurement" #> [209] "error" "we" "also" "note" #> [213] "that" "the" "cia" "model" #> [217] "yields" "considerably" "lower" "counts" #> [221] "for" "all" "but" "bachelor’s" #> [225] "degrees" "figure" "3" "displays" #> [229] "inferences" "for" "the" "average" #> [233] "income" "for" "different" "degrees" #> [237] "for" "most" "degrees" "the" #> [241] "point" "estimates" "for" "models" #> [245] "4" "7" "are" "reasonably" #> [249] "close" "with" "models" "4" #> [253] "23" #> #> #> [[24]] #> [[24]][[1]] #> [1] "and" "5" "again" #> [4] "giving" "similar" "results" #> [7] "the" "estimated" "average" #> [10] "income" "for" "professionals" #> [13] "differs" "noticeably" "across" #> [16] "models" "with" "model" #> [19] "4" "and" "model" #> [22] "5" "suggesting" "lower" #> [25] "averages" "than" "the" #> [28] "unadjusted" "acs" "estimates" #> [31] "or" "than" "models" #> [34] "6" "and" "7" #> [37] "we" "note" "that" #> [40] "the" "cia" "model" #> [43] "estimates" "are" "clearly" #> [46] "implausible" "as" "an" #> [49] "independent" "check" "on" #> [52] "these" "estimates" "we" #> [55] "considered" "the" "estimated" #> [58] "average" "earnings" "in" #> [61] "the" "2010" "current" #> [64] "population" "survey" "they" #> [67] "are" "83,720" "for" #> [70] "professional" "80,600" "for" #> [73] "ph" "d" "degree" #> [76] "66,144" "for" "master’s" #> [79] "degree" "and" "53,976" #> [82] "for" "bachelor’s" "degree" #> [85] "http" "www.collegequest.com" "bls" #> [88] "research" "education" "pays" #> [91] "2010" "aspx" "these" #> [94] "line" "up" "more" #> [97] "closely" "with" "the" #> [100] "estimates" "from" "model" #> [103] "5" "than" "any" #> [106] "other" "model" "especially" #> [109] "for" "the" 
"professional" #> [112] "degree" "category" "where" #> [115] "the" "estimates" "most" #> [118] "differ" "figure" "4" #> [121] "displays" "inferences" "for" #> [124] "the" "average" "income" #> [127] "for" "men" "and" #> [130] "women" "all" "models" #> [133] "support" "the" "conclusion" #> [136] "that" "men" "make" #> [139] "more" "than" "women" #> [142] "apparently" "misreporting" "in" #> [145] "education" "does" "not" #> [148] "account" "for" "that" #> [151] "gap" "at" "least" #> [154] "for" "the" "models" #> [157] "considered" "here" "we" #> [160] "note" "that" "model" #> [163] "4" "suggests" "potentially" #> [166] "larger" "income" "gaps" #> [169] "between" "male" "and" #> [172] "female" "ph" "d" #> [175] "recipients" "than" "the" #> [178] "other" "models" "5" #> [181] "concluding" "remarks" "the" #> [184] "framework" "presented" "in" #> [187] "this" "article" "offers" #> [190] "analysts" "tools" "for" #> [193] "using" "the" "information" #> [196] "in" "a" "high" #> [199] "quality" "separate" "data" #> [202] "source" "to" "adjust" #> [205] "for" "measurement" "errors" #> [208] "in" "the" "database" #> [211] "of" "interest" "key" #> [214] "to" "the" "framework" #> [217] "is" "to" "replace" #> [220] "conditional" "independence" "assumptions" #> [223] "typically" "used" "in" #> [226] "data" "fusion" "with" #> [229] "carefully" "considered" "measurement" #> [232] "error" "models" "this" #> [235] "avoids" "sacrificing" "information" #> [238] "and" "facilitates" "analysis" #> [241] "of" "the" "sensitivity" #> [244] "of" "conclusions" "to" #> [247] "alternative" "measurement" "error" #> [250] "specifications" "analysts" "can" #> [253] "use" "diagnostic" "tests" #> [256] "to" "rule" "out" #> [259] "some" "measurement" "error" #> [262] "models" "and" "perform" #> [265] "sensibility" "tests" "on" #> [268] "others" "to" "identify" #> [271] "reasonable" "candidates" "24" #> #> #> [[25]] #> [[25]][[1]] #> [1] "4" "x" "10" "10" #> [5] "acs" "cia" "model" "model" #> [9] "4" 
"9" "model" "5" #> [13] "model" "6" "model" "7" #> [17] "8" "estimated" "average" "income" #> [21] "7" "6" "5" "4" #> [25] "3" "ba" "ma" "prof" #> [29] "phd" "none" "education" "level" #> [33] "figure" "3" "multiple" "imputation" #> [37] "point" "and" "95" "confidence" #> [41] "interval" "estimates" "for" "the" #> [45] "average" "income" "within" "each" #> [49] "education" "level" "the" "acs" #> [53] "estimate" "is" "the" "survey" #> [57] "weighted" "estimate" "based" "on" #> [61] "the" "reported" "education" "level" #> [65] "in" "the" "2010" "acs" #> [69] "besides" "survey" "sampling" "contexts" #> [73] "like" "the" "one" "considered" #> [77] "here" "involving" "the" "acs" #> [81] "and" "nscg" "the" "framework" #> [85] "offers" "potential" "approaches" "for" #> [89] "dealing" "with" "possible" "mea" #> [93] "surement" "errors" "in" "organic" #> [97] "big" "data" "this" "is" #> [101] "increasingly" "important" "as" "data" #> [105] "stewards" "and" "analysts" "consider" #> [109] "replacing" "or" "supplementing" "high" #> [113] "quality" "but" "expensive" "surveys" #> [117] "with" "inexpensive" "and" "large" #> [121] "sample" "organic" "data" "often" #> [125] "scant" "attention" "is" "paid" #> [129] "to" "the" "potential" "impact" #> [133] "of" "measurement" "errors" "on" #> [137] "inferences" "from" "those" "data" #> [141] "the" "framework" "could" "be" #> [145] "used" "with" "high" "quality" #> [149] "validated" "surveys" "as" "the" #> [153] "gold" "standard" "data" "allowing" #> [157] "for" "adjustments" "to" "the" #> [161] "error" "prone" "organic" "data" #> [165] "25" #> #> #> [[26]] #> [[26]][[1]] #> [1] "4" "x" #> [3] "10" "bachelors" #> [5] "degree" "4" #> [7] "x" "10" #> [9] "masters" "degree" #> [11] "7" "8" #> [13] "male" "small" #> [15] "marker" "female" #> [17] "large" "marker" #> [19] "6.5" "7.5" #> [21] "7" "estimated" #> [23] "average" "income" #> [25] "by" "gender" #> [27] "estimated" "average" #> [29] "income" "by" #> [31] "gender" "6" #> [33] "6.5" 
"5.5" #> [35] "6" "5" #> [37] "5.5" "4.5" #> [39] "5" "4" #> [41] "4.5" "3.5" #> [43] "4" "3" #> [45] "3.5" "acs" #> [47] "cia" "m4" #> [49] "m5" "m6" #> [51] "m7" "acs" #> [53] "cia" "m4" #> [55] "m5" "m6" #> [57] "m7" "model" #> [59] "specification" "model" #> [61] "specification" "4" #> [63] "x" "10" #> [65] "professional" "degree" #> [67] "x" "10" #> [69] "4" "phd" #> [71] "degree" "12" #> [73] "10" "11" #> [75] "9" "10" #> [77] "estimated" "average" #> [79] "income" "by" #> [81] "gender" "estimated" #> [83] "average" "income" #> [85] "by" "gender" #> [87] "8" "9" #> [89] "8" "7" #> [91] "7" "6" #> [93] "6" "5" #> [95] "5" "4" #> [97] "4" "3" #> [99] "3" "acs" #> [101] "cia" "m4" #> [103] "m5" "m6" #> [105] "m7" "acs" #> [107] "cia" "m4" #> [109] "m5" "m6" #> [111] "m7" "model" #> [113] "specification" "model" #> [115] "specification" "figure" #> [117] "4" "multiple" #> [119] "imputation" "point" #> [121] "and" "95" #> [123] "confidence" "interval" #> [125] "estimates" "for" #> [127] "the" "average" #> [129] "income" "for" #> [131] "men" "and" #> [133] "women" "within" #> [135] "each" "education" #> [137] "level" "the" #> [139] "acs" "estimate" #> [141] "is" "the" #> [143] "survey" "weighted" #> [145] "estimate" "based" #> [147] "on" "the" #> [149] "reported" "education" #> [151] "level" "in" #> [153] "the" "2010" #> [155] "acs" "supplementary" #> [157] "materials" "all" #> [159] "supplemental" "files" #> [161] "listed" "below" #> [163] "are" "contained" #> [165] "in" "a" #> [167] "single" "zip" #> [169] "file" "supplementary.zip" #> [171] "and" "can" #> [173] "be" "obtained" #> [175] "via" "a" #> [177] "single" "download" #> [179] "supplementary" "results" #> [181] "supplementary" "details" #> [183] "and" "additional" #> [185] "results" "for" #> [187] "paper" "supp" #> [189] "material" "final.pdf" #> [191] "acs" "data" #> [193] "2010" "acs" #> [195] "data" "used" #> [197] "in" "the" #> [199] "paper" "acsdata" #> [201] "2010standardized.csv.zip" "26" #> #> #> 
[[27]] #> [[27]][[1]] #> [1] "matlab" "code" "matlab" #> [4] "files" "containing" "main" #> [7] "code" "maincode" "edu" #> [10] "2010app" "report1993" "m" #> [13] "and" "helper" "functions" #> [16] "design.m" "and" "dirsamp.m" #> [19] "as" "well" "as" #> [22] "parameter" "files" "mu.mat" #> [25] "and" "tauspd.mat" "code.zip" #> [28] "prior" "distributions" "csv" #> [31] "files" "are" "provided" #> [34] "for" "priors" "used" #> [37] "in" "model" "5" #> [40] "and" "read" "in" #> [43] "by" "main" "matlab" #> [46] "code" "referred" "to" #> [49] "as" "femalereportprior1993" "csv" #> [52] "malereport" "prior1993" "csv" #> [55] "betareportprior.csv" "priors.zip" "references" #> [58] "abayomi" "k" "gelman" #> [61] "a" "and" "levy" #> [64] "m" "2008" "diagnostics" #> [67] "for" "multivariate" "impu" #> [70] "tations" "journal" "of" #> [73] "the" "royal" "statistical" #> [76] "society" "series" "c" #> [79] "applied" "statistics" "57" #> [82] "273" "291" "black" #> [85] "d" "haviland" "a" #> [88] "sanders" "s" "and" #> [91] "taylor" "l" "2006" #> [94] "why" "do" "minority" #> [97] "men" "earn" "less" #> [100] "a" "study" "of" #> [103] "wage" "differentials" "among" #> [106] "the" "highly" "educated" #> [109] "the" "review" "of" #> [112] "economics" "and" "statistics" #> [115] "88" "300" "313" #> [118] "black" "d" "sanders" #> [121] "s" "and" "taylor" #> [124] "l" "2003" "measurement" #> [127] "of" "higher" "education" #> [130] "in" "the" "census" #> [133] "and" "current" "population" #> [136] "survey" "journal" "of" #> [139] "the" "american" "statistical" #> [142] "association" "98" "545" #> [145] "554" "black" "d" #> [148] "a" "haviland" "a" #> [151] "m" "sanders" "s" #> [154] "g" "and" "taylor" #> [157] "l" "j" "2008" #> [160] "gender" "wage" "disparities" #> [163] "among" "the" "highly" #> [166] "educated" "journal" "of" #> [169] "human" "resources" "43" #> [172] "630" "659" "carrig" #> [175] "m" "manrique" "vallier" #> [178] "d" "ranby" "k" #> [181] "reiter" "j" "p" #> 
[184] "and" "hoyle" "r" #> [187] "2015" "a" "multiple" #> [190] "imputation" "based" "method" #> [193] "for" "the" "retrospective" #> [196] "harmonization" "of" "data" #> [199] "sets" "multivariate" "behavioral" #> [202] "research" "50" "383" #> [205] "397" "curran" "p" #> [208] "j" "and" "hussong" #> [211] "a" "m" "2009" #> [214] "integrative" "data" "analysis" #> [217] "the" "simultaneous" "analysis" #> [220] "of" "multiple" "data" #> [223] "sets" "psychological" "methods" #> [226] "14" "81" "100" #> [229] "d’orazio" "m" "di" #> [232] "zio" "m" "and" #> [235] "scanu" "m" "2006" #> [238] "statistical" "matching" "theory" #> [241] "and" "practice" "hoboken" #> [244] "nj" "wiley" "dunson" #> [247] "d" "b" "and" #> [250] "xing" "c" "2009" #> [253] "nonparametric" "bayes" "modeling" #> [256] "of" "multivariate" "categorical" #> [259] "data" "journal" "of" #> [262] "the" "american" "statistical" #> [265] "association" "104" "1042" #> [268] "1051" "fesco" "r" #> [271] "s" "frase" "m" #> [274] "j" "and" "kannankutty" #> [277] "n" "2012" "using" #> [280] "the" "american" "commu" #> [283] "nity" "survey" "as" #> [286] "the" "sampling" "frame" #> [289] "for" "the" "national" #> [292] "survey" "of" "college" #> [295] "graduates" "working" "paper" #> [298] "ncses" "12" "201" #> [301] "national" "science" "foundation" #> [304] "national" "center" "for" #> [307] "science" "and" "engineering" #> [310] "statistics" "arlington" "va" #> [313] "27" #> #> #> [[28]] #> [[28]][[1]] #> [1] "finamore" "j" "2013" "national" #> [5] "survey" "of" "college" "graduates" #> [9] "about" "the" "survey" "na" #> [13] "tional" "center" "for" "science" #> [17] "and" "engineering" "statistics" "fosdick" #> [21] "b" "k" "deyoreo" "m" #> [25] "and" "reiter" "j" "p" #> [29] "2016" "categorical" "data" "fusion" #> [33] "using" "auxiliary" "information" "annals" #> [37] "of" "applied" "statistics" "to" #> [41] "appear" "he" "y" "landrum" #> [45] "m" "b" "and" "zaslavksy" #> [49] "a" "m" "2014" "combining" 
#> [53] "information" "from" "two" "data" #> [57] "sources" "with" "misreporting" "and" #> [61] "incompleteness" "to" "assess" "hospice" #> [65] "use" "among" "cancer" "patients" #> [69] "a" "multiple" "imputation" "appraoch" #> [73] "statistics" "in" "medicine" "33" #> [77] "3710" "3724" "hirano" "k" #> [81] "imbens" "g" "ridder" "g" #> [85] "and" "rubin" "d" "2001" #> [89] "combining" "panel" "data" "sets" #> [93] "with" "attrition" "and" "refreshment" #> [97] "samples" "econometrica" "69" "1645" #> [101] "1659" "kim" "h" "j" #> [105] "cox" "l" "h" "karr" #> [109] "a" "f" "reiter" "j" #> [113] "p" "and" "wang" "q" #> [117] "2015" "simultane" "ous" "edit" #> [121] "imputation" "for" "continuous" "microdata" #> [125] "journal" "of" "the" "american" #> [129] "statistical" "association" "110" "987" #> [133] "999" "lohr" "s" "l" #> [137] "2010" "sampling" "design" "and" #> [141] "analysis" "boston" "ma" "brooks" #> [145] "cole" "2nd" "ed" "manrique" #> [149] "vallier" "d" "and" "reiter" #> [153] "j" "p" "2016" "bayesian" #> [157] "simultaneous" "edit" "and" "impu" #> [161] "tation" "for" "multivariate" "categorical" #> [165] "data" "journal" "of" "the" #> [169] "american" "statistical" "asso" "ciation" #> [173] "to" "appear" "moriarity" "c" #> [177] "and" "scheuren" "f" "2001" #> [181] "statistical" "matching" "a" "paradigm" #> [185] "for" "assessing" "the" "uncertainty" #> [189] "in" "the" "procedure" "journal" #> [193] "of" "official" "statistics" "17" #> [197] "407" "422" "national" "science" #> [201] "foundation" "1993" "national" "survey" #> [205] "of" "college" "graduates" "1993" #> [209] "http" "doi.org" "10.3886" "icpsr06880" #> [213] "v1" "icpsr06880" "v1" "ann" #> [217] "arbor" "mi" "inter" "university" #> [221] "consortium" "for" "political" "and" #> [225] "social" "research" "distributor" "2014" #> [229] "10" "02" "pepe" "m" #> [233] "s" "1992" "inference" "using" #> [237] "surrogate" "outcome" "data" "and" #> [241] "a" "validation" "sample" "biometrika" 
#> [245] "79" "355" "365" "raghunathan" #> [249] "t" "e" "2006" "combining" #> [253] "information" "from" "multiple" "surveys" #> [257] "for" "assess" "ing" "health" #> [261] "disparities" "allgemeines" "statistisches" "archiv" #> [265] "90" "515" "526" "rassler" #> [269] "s" "2002" "statistical" "matching" #> [273] "new" "york" "springer" "reiter" #> [277] "j" "2008" "multiple" "imputation" #> [281] "when" "records" "used" "for" #> [285] "imputation" "are" "not" "used" #> [289] "or" "disseminated" "for" "analysis" #> [293] "biometrika" "95" "933" "946" #> [297] "reiter" "j" "p" "2012" #> [301] "bayesian" "finite" "population" "imputation" #> [305] "for" "data" "fusion" "statistica" #> [309] "sinica" "22" "795" "811" #> [313] "rubin" "d" "b" "1986" #> [317] "statistical" "matching" "using" "file" #> [321] "concatenation" "with" "adjusted" "weights" #> [325] "and" "multiple" "imputations" "journal" #> [329] "of" "business" "economic" "statistics" #> [333] "4" "87" "94" "28" #> #> #> [[29]] #> [[29]][[1]] #> [1] "1987" "multiple" "imputation" "for" #> [5] "nonresponse" "in" "surveys" "new" #> [9] "york" "john" "wiley" "sons" #> [13] "schenker" "n" "and" "raghunathan" #> [17] "t" "e" "2007" "combining" #> [21] "information" "from" "multiple" "surveys" #> [25] "to" "enhance" "estimation" "of" #> [29] "measures" "of" "health" "statistics" #> [33] "in" "medicine" "26" "1802" #> [37] "1811" "schenker" "n" "raghunathan" #> [41] "t" "e" "and" "bondarenko" #> [45] "i" "2010" "improving" "on" #> [49] "analyses" "of" "self" "reported" #> [53] "data" "in" "a" "large" #> [57] "scale" "health" "survey" "by" #> [61] "using" "information" "from" "an" #> [65] "examination" "based" "survey" "statistics" #> [69] "in" "medicine" "29" "533" #> [73] "545" "schifeling" "t" "a" #> [77] "cheng" "c" "reiter" "j" #> [81] "p" "and" "hillygus" "d" #> [85] "s" "2015" "accounting" "for" #> [89] "nonignorable" "unit" "nonresponse" "and" #> [93] "attrition" "in" "panel" "studies" #> [97] "with" 
"refreshment" "samples" "journal" #> [101] "of" "survey" "statistics" "and" #> [105] "methodology" "3" "265" "295" #> [109] "si" "y" "and" "reiter" #> [113] "j" "2013" "nonparametric" "bayesian" #> [117] "multiple" "imputation" "for" "incom" #> [121] "plete" "categorical" "variables" "in" #> [125] "large" "scale" "assessment" "surveys" #> [129] "journal" "of" "educational" "and" #> [133] "behavioral" "statistics" "38" "499" #> [137] "521" "si" "y" "reiter" #> [141] "j" "p" "and" "hillygus" #> [145] "d" "s" "2015" "semi" #> [149] "parametric" "selection" "models" "for" #> [153] "potentially" "non" "ignorable" "attrition" #> [157] "in" "panel" "studies" "with" #> [161] "refreshment" "samples" "political" "analysis" #> [165] "23" "92" "112" "siddique" #> [169] "j" "reiter" "j" "p" #> [173] "brincks" "a" "gibbons" "r" #> [177] "d" "crespi" "c" "m" #> [181] "and" "brown" "c" "h" #> [185] "2015" "multiple" "imputation" "for" #> [189] "harmonizing" "longitudinal" "non" "commensurate" #> [193] "measures" "in" "individual" "participant" #> [197] "data" "meta" "analysis" "statistics" #> [201] "in" "medicine" "34" "3399" #> [205] "3414" "tarmast" "g" "2001" #> [209] "multivariate" "log" "normal" "distribution" #> [213] "in" "international" "statistical" "institute" #> [217] "seoul" "53rd" "session" "yucel" #> [221] "r" "m" "and" "zaslavsky" #> [225] "a" "m" "2005" "imputation" #> [229] "of" "binary" "treatment" "variables" #> [233] "with" "measurement" "error" "in" #> [237] "administrative" "data" "journal" "of" #> [241] "the" "american" "statistical" "association" #> [245] "100" "1123" "1132" "29" #> #> #> [[30]] #> [[30]][[1]] #> [1] "table" "4" "error" "rate" #> [5] "estimates" "from" "different" "model" #> [9] "specifications" "models" "1" "7" #> [13] "are" "run" "for" "100,000" #> [17] "mcmc" "iterations" "we" "save" #> [21] "m" "50" "completed" "datasets" #> [25] "under" "each" "model" "for" #> [29] "each" "dataset" "we" "compute" #> [33] "the" "estimated" "overall" 
"error" #> [37] "rate" "estimated" "error" "rate" #> [41] "by" "gender" "and" "imputed" #> [45] "y" "and" "associated" "variances" #> [49] "using" "ratio" "estimators" "that" #> [53] "incorporate" "the" "acs" "final" #> [57] "survey" "weights" "estimate" "estimate" #> [61] "by" "group" "overall" "y" #> [65] "ba" "y" "ma" "y" #> [69] "prof" "y" "phd" "cia" #> [73] "model" "male" "37" "36" #> [77] "37" "76" "75" "76" #> [81] "91" "91" "92" "94" #> [85] "93" "95" "57" "55" #> [89] "58" "female" "35" "35" #> [93] "36" "72" "71" "72" #> [97] "95" "94" "95" "97" #> [101] "96" "97" "model" "1" #> [105] "male" "05" "04" "06" #> [109] "10" "08" "11" "18" #> [113] "15" "21" "27" "23" #> [117] "31" "17" "16" "19" #> [121] "female" "05" "05" "06" #> [125] "09" "08" "10" "18" #> [129] "15" "21" "28" "24" #> [133] "32" "model" "2" "male" #> [137] "05" "04" "06" "18" #> [141] "16" "21" "27" "18" #> [145] "37" "36" "30" "42" #> [149] "20" "18" "21" "female" #> [153] "05" "05" "06" "12" #> [157] "10" "14" "26" "20" #> [161] "33" "41" "29" "53" #> [165] "model" "3" "male" "05" #> [169] "04" "06" "09" "08" #> [173] "11" "17" "14" "20" #> [177] "25" "21" "30" "17" #> [181] "16" "19" "female" "05" #> [185] "05" "06" "09" "08" #> [189] "10" "17" "14" "20" #> [193] "26" "21" "31" "model" #> [197] "4" "male" "05" "04" #> [201] "06" "19" "16" "23" #> [205] "36" "26" "46" "36" #> [209] "27" "45" "22" "20" #> [213] "24" "female" "09" "08" #> [217] "10" "14" "11" "17" #> [221] "52" "44" "59" "55" #> [225] "40" "70" "model" "5" #> [229] "male" "07" "06" "08" #> [233] "19" "16" "22" "23" #> [237] "14" "32" "34" "27" #> [241] "41" "22" "20" "24" #> [245] "female" "09" "08" "10" #> [249] "12" "09" "15" "50" #> [253] "43" "57" "31" "17" #> [257] "46" "model" "6" "male" #> [261] "05" "05" "05" "09" #> [265] "08" "10" "10" "09" #> [269] "11" "10" "09" "11" #> [273] "16" "14" "17" "female" #> [277] "05" "04" "05" "06" #> [281] "05" "07" "16" "14" #> [285] "18" "07" "06" "09" #> [289] "model" "7" 
"male" "01" #> [293] "01" "01" "01" "00" #> [297] "01" "00" "00" "01" #> [301] "01" "00" "01" "11" #> [305] "09" "13" "female" "01" #> [309] "01" "01" "01" "01" #> [313] "01" "01" "00" "01" #> [317] "01" "00" "01" "30" #> #> #> [[31]] #> [[31]][[1]] #> [1] "table" "5" "estimated" "mean" #> [5] "and" "95" "confidence" "interval" #> [9] "of" "reporting" "probabilities" "under" #> [13] "model" "2" "and" "reporting" #> [17] "probabilities" "by" "gender" "under" #> [21] "model" "4" "z" "ba" #> [25] "z" "ma" "z" "prof" #> [29] "z" "phd" "y" "ba" #> [33] "model" "2" "95" "87" #> [37] "1.00" "04" "00" "11" #> [41] "01" "00" "03" "model" #> [45] "4" "male" "96" "90" #> [49] "1.00" "02" "00" "07" #> [53] "02" "00" "05" "model" #> [57] "4" "female" "67" "58" #> [61] "76" "30" "22" "38" #> [65] "03" "00" "07" "y" #> [69] "ma" "model" "2" "02" #> [73] "00" "06" "51" "43" #> [77] "59" "47" "39" "55" #> [81] "model" "4" "male" "04" #> [85] "00" "11" "57" "48" #> [89] "66" "39" "31" "47" #> [93] "model" "4" "female" "11" #> [97] "00" "25" "39" "26" #> [101] "52" "50" "40" "61" #> [105] "y" "prof" "model" "2" #> [109] "05" "00" "16" "69" #> [113] "54" "83" "26" "14" #> [117] "38" "model" "4" "male" #> [121] "02" "00" "06" "69" #> [125] "44" "94" "29" "04" #> [129] "54" "model" "4" "female" #> [133] "91" "79" "1.00" "06" #> [137] "00" "16" "04" "00" #> [141] "10" "y" "phd" "model" #> [145] "2" "01" "00" "04" #> [149] "39" "15" "63" "60" #> [153] "36" "83" "model" "4" #> [157] "male" "01" "00" "05" #> [161] "21" "02" "39" "78" #> [165] "60" "96" "model" "4" #> [169] "female" "10" "00" "30" #> [173] "77" "50" "1.00" "13" #> [177] "00" "34" "y" "none" #> [181] "model" "2" "95" "95" #> [185] "96" "03" "03" "04" #> [189] "01" "01" "01" "00" #> [193] "00" "00" "model" "4" #> [197] "male" "97" "96" "97" #> [201] "03" "02" "03" "01" #> [205] "00" "01" "00" "00" #> [209] "00" "model" "4" "female" #> [213] "96" "95" "97" "04" #> [217] "03" "05" "00" "00" #> [221] "00" "00" "00" "00" #> [225] 
"31" #> #>