DEVELOPMENT... { "data_id": "44792", "name": "KDDCup09-Upselling_seed_4_nrows_2000_nclasses_10_ncols_100_stratify_True", "exact_name": "KDDCup09-Upselling_seed_4_nrows_2000_nclasses_10_ncols_100_stratify_True", "version": 1, "version_label": "409f6313-51ce-421b-b48f-223b0469b7b4", "description": "Subsampling of the dataset KDDCup09-Upselling (43072) with\n\nseed=4\nargs.nrows=2000\nargs.ncols=100\nargs.nclasses=10\nargs.no_stratify=True\nGenerated with the following source code:\n\n\n```python\n def subsample(\n self,\n seed: int,\n nrows_max: int = 2_000,\n ncols_max: int = 100,\n nclasses_max: int = 10,\n stratified: bool = True,\n ) -> Dataset:\n rng = np.random.default_rng(seed)\n\n x = self.x\n y = self.y\n\n # Uniformly sample\n classes = y.unique()\n if len(classes) > nclasses_max:\n vcs = y.value_counts()\n selected_classes = rng.choice(\n classes,\n size=nclasses_max,\n replace=False,\n p=vcs \/ sum(vcs),\n )\n\n # Select the indices where one of these classes is present\n idxs = y.index[y.isin(classes)]\n x = x.iloc[idxs]\n y = y.iloc[idxs]\n\n # Uniformly sample columns if required\n if len(x.columns) > ncols_max:\n columns_idxs = rng.choice(\n list(range(len(x.columns))), size=ncols_max, replace=False\n )\n sorted_column_idxs = sorted(columns_idxs)\n selected_columns = list(x.columns[sorted_column_idxs])\n x = x[selected_columns]\n else:\n sorted_column_idxs = list(range(len(x.columns)))\n\n if len(x) > nrows_max:\n # Stratify accordingly\n target_name = y.name\n data = pd.concat((x, y), axis=\"columns\")\n _, subset = train_test_split(\n data,\n test_size=nrows_max,\n stratify=data[target_name],\n shuffle=True,\n random_state=seed,\n )\n x = subset.drop(target_name, axis=\"columns\")\n y = subset[target_name]\n\n # We need to convert categorical columns to string for openml\n categorical_mask = [self.categorical_mask[i] for i in sorted_column_idxs]\n columns = list(x.columns)\n\n return Dataset(\n # Technically this is not the same but it's where it was derived from\n dataset=self.dataset,\n x=x,\n y=y,\n categorical_mask=categorical_mask,\n columns=columns,\n )\n```", "format": "arff", "uploader": "David Wilson", "uploader_id": 32840, "visibility": "public", "creator": "\"Eddie Bergman\"", "contributor": null, "date": "2022-11-17 19:06:00", "update_comment": null, "last_update": "2022-11-17 19:06:00", "licence": "Public", "status": "active", "error_message": null, "url": "https:\/\/api.openml.org\/data\/download\/22111554\/dataset", "default_target_attribute": "upselling", "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "KDDCup09-Upselling_seed_4_nrows_2000_nclasses_10_ncols_100_stratify_True", "Subsampling of the dataset KDDCup09-Upselling (43072) with seed=4 args.nrows=2000 args.ncols=100 args.nclasses=10 args.no_stratify=True Generated with the following source code: ```python def subsample( self, seed: int, nrows_max: int = 2_000, ncols_max: int = 100, nclasses_max: int = 10, stratified: bool = True, ) -> Dataset: rng = np.random.default_rng(seed) x = self.x y = self.y # Uniformly sample classes = y.unique() if len(classes) > nclasses_max: vcs = y.value_counts() selected_classes = r " ], "weight": 5 }, "qualities": { "NumberOfInstances": 2000, "NumberOfFeatures": 101, "NumberOfClasses": 2, "NumberOfMissingValues": 9426, "NumberOfInstancesWithMissingValues": 2000, "NumberOfNumericFeatures": 98, "NumberOfSymbolicFeatures": 3, "PercentageOfSymbolicFeatures": 2.9702970297029703, "AutoCorrelation": 0.8599299649824912, "PercentageOfNumericFeatures": 97.02970297029702, "PercentageOfMissingValues": 4.6663366336633665, "PercentageOfInstancesWithMissingValues": 100, "PercentageOfBinaryFeatures": 0.9900990099009901, "NumberOfBinaryFeatures": 1, "MinorityClassSize": 147, "MinorityClassPercentage": 7.35, "MajorityClassSize": 1853, "MajorityClassPercentage": 92.65, "Dimensionality": 0.0505 }, "tags": [], "features": [ { "name": "upselling", "index": "100", "type": "nominal", "distinct": "2", "missing": "0", "target": "1", "distr": [ [ "-1", "1" ], [ [ "1853", "0" ], [ "0", "147" ] ] ] }, { "name": "Var403", "index": "0", "type": "numeric", "distinct": "3", "missing": "0", "min": "0", "max": "266", "mean": "0", "stdev": "6" }, { "name": "Var520", "index": "1", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "2", "mean": "0", "stdev": "0" }, { "name": "Var900", "index": "2", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "4", "mean": "0", "stdev": "0" }, { "name": "Var1181", "index": "3", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "45", "mean": "0", "stdev": "1" }, { "name": "Var1201", "index": "4", "type": "numeric", "distinct": "4", "missing": "0", "min": "0", "max": "98", "mean": "0", "stdev": "3" }, { "name": "Var1344", "index": "5", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "2", "mean": "0", "stdev": "0" }, { "name": "Var1720", "index": "6", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var1908", "index": "7", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var1985", "index": "8", "type": "numeric", "distinct": "16", "missing": "1984", "min": "3559", "max": "68948", "mean": "19185", "stdev": "18089" }, { "name": "Var2072", "index": "9", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "9", "mean": "0", "stdev": "0" }, { "name": "Var2455", "index": "10", "type": "numeric", "distinct": "10", "missing": "1947", "min": "0", "max": "78", "mean": "11", "stdev": "19" }, { "name": "Var2548", "index": "11", "type": "numeric", "distinct": "7", "missing": "0", "min": "0", "max": "44", "mean": "0", "stdev": "1" }, { "name": "Var2592", "index": "12", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "15", "mean": "0", "stdev": "0" }, { "name": "Var2643", "index": "13", "type": "numeric", "distinct": "3", "missing": "0", "min": "0", "max": "108", "mean": "0", "stdev": "2" }, { "name": "Var2681", "index": "14", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var2846", "index": "15", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var2990", "index": "16", "type": "numeric", "distinct": "14", "missing": "0", "min": "0", "max": "301", "mean": "1", "stdev": "9" }, { "name": "Var3090", "index": "17", "type": "numeric", "distinct": "6", "missing": "0", "min": "0", "max": "207", "mean": "0", "stdev": "9" }, { "name": "Var3235", "index": "18", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "8", "mean": "0", "stdev": "1" }, { "name": "Var3262", "index": "19", "type": "numeric", "distinct": "78", "missing": "0", "min": "0", "max": "275", "mean": "2", "stdev": "15" }, { "name": "Var3303", "index": "20", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "14", "mean": "0", "stdev": "0" }, { "name": "Var3332", "index": "21", "type": "numeric", "distinct": "3", "missing": "0", "min": "0", "max": "135", "mean": "0", "stdev": "3" }, { "name": "Var3990", "index": "22", "type": "numeric", "distinct": "3", "missing": "0", "min": "0", "max": "80", "mean": "0", "stdev": "2" }, { "name": "Var4201", "index": "23", "type": "numeric", "distinct": "12", "missing": "0", "min": "0", "max": "200", "mean": "1", "stdev": "12" }, { "name": "Var4519", "index": "24", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var5022", "index": "25", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "8", "mean": "0", "stdev": "0" }, { "name": "Var5210", "index": "26", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var5399", "index": "27", "type": "numeric", "distinct": "1529", "missing": "0", "min": "0", "max": "198045600", "mean": "4207543", "stdev": "15769028" }, { "name": "Var5420", "index": "28", "type": "numeric", "distinct": "5", "missing": "0", "min": "0", "max": "40", "mean": "0", "stdev": "2" }, { "name": "Var5482", "index": "29", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "7", "mean": "0", "stdev": "0" }, { "name": "Var5495", "index": "30", "type": "numeric", "distinct": "88", "missing": "0", "min": "0", "max": "23408", "mean": "119", "stdev": "1082" }, { "name": "Var5588", "index": "31", "type": "numeric", "distinct": "3", "missing": "0", "min": "0", "max": "6", "mean": "0", "stdev": "0" }, { "name": "Var5707", "index": "32", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var6070", "index": "33", "type": "numeric", "distinct": "13", "missing": "0", "min": "0", "max": "125", "mean": "1", "stdev": "7" }, { "name": "Var6394", "index": "34", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "108", "mean": "0", "stdev": "2" }, { "name": "Var6728", "index": "35", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "4", "mean": "0", "stdev": "0" }, { "name": "Var6859", "index": "36", "type": "numeric", "distinct": "7", "missing": "0", "min": "0", "max": "48", "mean": "0", "stdev": "2" }, { "name": "Var6897", "index": "37", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "4", "mean": "0", "stdev": "0" }, { "name": "Var7086", "index": "38", "type": "numeric", "distinct": "3", "missing": "0", "min": "0", "max": "216", "mean": "0", "stdev": "7" }, { "name": "Var7095", "index": "39", "type": "numeric", "distinct": "42", "missing": "0", "min": "0", "max": "1315", "mean": "14", "stdev": "51" }, { "name": "Var7145", "index": "40", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "6", "mean": "0", "stdev": "0" }, { "name": "Var7329", "index": "41", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var7346", "index": "42", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "6", "mean": "0", "stdev": "0" }, { "name": "Var7408", "index": "43", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var7418", "index": "44", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "6", "mean": "0", "stdev": "0" }, { "name": "Var7423", "index": "45", "type": "numeric", "distinct": "3", "missing": "0", "min": "0", "max": "84", "mean": "0", "stdev": "3" }, { "name": "Var7445", "index": "46", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var7585", "index": "47", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "6", "mean": "0", "stdev": "0" }, { "name": "Var7755", "index": "48", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var7774", "index": "49", "type": "numeric", "distinct": "4", "missing": "0", "min": "0", "max": "154", "mean": "0", "stdev": "5" }, { "name": "Var7959", "index": "50", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "6", "mean": "0", "stdev": "1" }, { "name": "Var8026", "index": "51", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var8077", "index": "52", "type": "numeric", "distinct": "1680", "missing": "0", "min": "0", "max": "217438900", "mean": "4105145", "stdev": "14168141" }, { "name": "Var8523", "index": "53", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "8", "mean": "0", "stdev": "0" }, { "name": "Var8575", "index": "54", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "7", "mean": "0", "stdev": "1" }, { "name": "Var8621", "index": "55", "type": "numeric", "distinct": "306", "missing": "0", "min": "0", "max": "4102", "mean": "115", "stdev": "264" }, { "name": "Var8669", "index": "56", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "6", "mean": "0", "stdev": "1" }, { "name": "Var8760", "index": "57", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "6", "mean": "0", "stdev": "1" }, { "name": "Var9010", "index": "58", "type": "numeric", "distinct": "20", "missing": "0", "min": "0", "max": "180", "mean": "29", "stdev": "27" }, { "name": "Var9050", "index": "59", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var9303", "index": "60", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var9546", "index": "61", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "3", "mean": "0", "stdev": "1" }, { "name": "Var9863", "index": "62", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var9898", "index": "63", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "8", "mean": "0", "stdev": "1" }, { "name": "Var10032", "index": "64", "type": "numeric", "distinct": "65", "missing": "0", "min": "0", "max": "2016", "mean": "29", "stdev": "81" }, { "name": "Var10070", "index": "65", "type": "numeric", "distinct": "624", "missing": "0", "min": "0", "max": "1755072", "mean": "32479", "stdev": "88629" }, { "name": "Var10323", "index": "66", "type": "numeric", "distinct": "7", "missing": "0", "min": "0", "max": "42", "mean": "3", "stdev": "9" }, { "name": "Var10478", "index": "67", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "2", "mean": "0", "stdev": "0" }, { "name": "Var10768", "index": "68", "type": "numeric", "distinct": "20", "missing": "0", "min": "0", "max": "273", "mean": "4", "stdev": "15" }, { "name": "Var10885", "index": "69", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "8", "mean": "0", "stdev": "0" }, { "name": "Var11489", "index": "70", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var11719", "index": "71", "type": "numeric", "distinct": "9", "missing": "0", "min": "0", "max": "108", "mean": "1", "stdev": "5" }, { "name": "Var11902", "index": "72", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var12004", "index": "73", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "5", "mean": "0", "stdev": "0" }, { "name": "Var12073", "index": "74", "type": "numeric", "distinct": "4", "missing": "0", "min": "0", "max": "77", "mean": "0", "stdev": "2" }, { "name": "Var12731", "index": "75", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var12940", "index": "76", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "4", "mean": "0", "stdev": "0" }, { "name": "Var13071", "index": "77", "type": "numeric", "distinct": "14", "missing": "0", "min": "0", "max": "28", "mean": "0", "stdev": "2" }, { "name": "Var13155", "index": "78", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var13197", "index": "79", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var13333", "index": "80", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var13391", "index": "81", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "78", "mean": "0", "stdev": "2" }, { "name": "Var13399", "index": "82", "type": "numeric", "distinct": "12", "missing": "0", "min": "0", "max": "100", "mean": "1", "stdev": "7" }, { "name": "Var13809", "index": "83", "type": "numeric", "distinct": "38", "missing": "1936", "min": "75", "max": "259", "mean": "159", "stdev": "63" }, { "name": "Var13817", "index": "84", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var13855", "index": "85", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "6", "mean": "0", "stdev": "0" }, { "name": "Var13950", "index": "86", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "8", "mean": "0", "stdev": "0" }, { "name": "Var13986", "index": "87", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var13988", "index": "88", "type": "numeric", "distinct": "71", "missing": "0", "min": "0", "max": "7155", "mean": "68", "stdev": "229" }, { "name": "Var14022", "index": "89", "type": "numeric", "distinct": "10", "missing": "0", "min": "0", "max": "108", "mean": "2", "stdev": "8" }, { "name": "Var14270", "index": "90", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var14332", "index": "91", "type": "numeric", "distinct": "5", "missing": "0", "min": "0", "max": "28", "mean": "1", "stdev": "3" }, { "name": "Var14367", "index": "92", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var14397", "index": "93", "type": "numeric", "distinct": "16", "missing": "0", "min": "0", "max": "192", "mean": "1", "stdev": "15" }, { "name": "Var14403", "index": "94", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var14484", "index": "95", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var14530", "index": "96", "type": "numeric", "distinct": "1", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "Var14625", "index": "97", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "9", "mean": "0", "stdev": "1" }, { "name": "Var14882", "index": "98", "type": "nominal", "distinct": "5", "missing": "1872", "distr": [ [ "DYL8R2iBYrMgPXGW", "GDe8R2qCGrMgPUqJ", "cIC70X8", "jCrv2_0FcX", "rG9TzSEGqD" ], [ [ "53", "1" ], [ "52", "1" ], [ "1", "0" ], [ "18", "0" ], [ "2", "0" ] ] ] }, { "name": "Var14896", "index": "99", "type": "nominal", "distinct": "14", "missing": "1687", "distr": [ [ "3bWYgVeTHBV0RTOU", "7zIGlOXONU", "DM7vWc8Fqk", "PrhJBtZCqM3tnGi2vLU", "QHpf7_5nmI", "jiE2mH8ervrra", "mLLNZZ6XwVCebViY", "mPR1Z2CeZHDde0vb16PMiZ", "rCt_hNS2iHndRyM_", "rCt_hNS2iHndTTki", "rCt_hNS2ia0G3Ry3", "rCt_hNS2ia0GRJNE", "rCt_hNS2ia0GaQ_B", "rCt_hNS2ia0GukF3", "rCt_hNS2ia0GwGJv", "rmfkwXnOv5ub9", "sAaMkwB", "xUFCEWs", "yzZQoa70PA" ], [ [ "3", "0" ], [ "2", "0" ], [ "0", "0" ], [ "4", "1" ], [ "4", "1" ], [ "0", "0" ], [ "2", "1" ], [ "14", "0" ], [ "0", "0" ], [ "0", "0" ], [ "2", "0" ], [ "8", "1" ], [ "5", "1" ], [ "2", "0" ], [ "2", "0" ], [ "0", "0" ], [ "3", "0" ], [ "226", "19" ], [ "11", "1" ] ] ] } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }