导入支持库
In [1020]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from pandas import Series, DataFrame
#在GUI中显示
# %matplotlib tk
#在行内显示(默认)
%matplotlib inline
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False载入数据 合并数据 data_obj
In [226]:
users_df = pd.read_csv('./data/users.dat', sep='::', engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
movies_df = pd.read_csv('./data/movies.dat', sep='::', engine='python', names=['MovieID', 'Title', 'Genres'])
ratings_df = pd.read_csv('./data/ratings.dat', sep='::', engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
# print(users_df)
# print(movies_df)
# print(ratings_df)
combine_movies_ratings_df = pd.merge(movies_df, ratings_df, on='MovieID')
combine_movies_ratings_users_df = pd.merge(combine_movies_ratings_df, users_df, on='UserID')
data_obj = combine_movies_ratings_users_df
data_objOut[226]:
| MovieID | Title | Genres | UserID | Rating | Timestamp | Gender | Age | Occupation | Zip-code | |||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Toy Story (1995) | Animation\ | Children's\ | Comedy | 1 | 5 | 978824268 | F | 1 | 10 | 48067 | ||
| 1 | 48 | Pocahontas (1995) | Animation\ | Children's\ | Musical\ | Romance | 1 | 5 | 978824351 | F | 1 | 10 | 48067 | |
| 2 | 150 | Apollo 13 (1995) | Drama | 1 | 5 | 978301777 | F | 1 | 10 | 48067 | ||||
| 3 | 260 | Star Wars: Episode IV - A New Hope (1977) | Action\ | Adventure\ | Fantasy\ | Sci-Fi | 1 | 4 | 978300760 | F | 1 | 10 | 48067 | |
| 4 | 527 | Schindler's List (1993) | Drama\ | War | 1 | 5 | 978824195 | F | 1 | 10 | 48067 | |||
| 5 | 531 | Secret Garden, The (1993) | Children's\ | Drama | 1 | 4 | 978302149 | F | 1 | 10 | 48067 | |||
| 6 | 588 | Aladdin (1992) | Animation\ | Children's\ | Comedy\ | Musical | 1 | 4 | 978824268 | F | 1 | 10 | 48067 | |
| 7 | 594 | Snow White and the Seven Dwarfs (1937) | Animation\ | Children's\ | Musical | 1 | 4 | 978302268 | F | 1 | 10 | 48067 | ||
| 8 | 595 | Beauty and the Beast (1991) | Animation\ | Children's\ | Musical | 1 | 5 | 978824268 | F | 1 | 10 | 48067 | ||
| 9 | 608 | Fargo (1996) | Crime\ | Drama\ | Thriller | 1 | 4 | 978301398 | F | 1 | 10 | 48067 | ||
| 10 | 661 | James and the Giant Peach (1996) | Animation\ | Children's\ | Musical | 1 | 3 | 978302109 | F | 1 | 10 | 48067 | ||
| 11 | 720 | Wallace & Gromit: The Best of Aardman Animatio... | Animation | 1 | 3 | 978300760 | F | 1 | 10 | 48067 | ||||
| 12 | 745 | Close Shave, A (1995) | Animation\ | Comedy\ | Thriller | 1 | 3 | 978824268 | F | 1 | 10 | 48067 | ||
| 13 | 783 | Hunchback of Notre Dame, The (1996) | Animation\ | Children's\ | Musical | 1 | 4 | 978824291 | F | 1 | 10 | 48067 | ||
| 14 | 914 | My Fair Lady (1964) | Musical\ | Romance | 1 | 3 | 978301968 | F | 1 | 10 | 48067 | |||
| 15 | 919 | Wizard of Oz, The (1939) | Adventure\ | Children's\ | Drama\ | Musical | 1 | 4 | 978301368 | F | 1 | 10 | 48067 | |
| 16 | 938 | Gigi (1958) | Musical | 1 | 4 | 978301752 | F | 1 | 10 | 48067 | ||||
| 17 | 1022 | Cinderella (1950) | Animation\ | Children's\ | Musical | 1 | 5 | 978300055 | F | 1 | 10 | 48067 | ||
| 18 | 1028 | Mary Poppins (1964) | Children's\ | Comedy\ | Musical | 1 | 5 | 978301777 | F | 1 | 10 | 48067 | ||
| 19 | 1029 | Dumbo (1941) | Animation\ | Children's\ | Musical | 1 | 5 | 978302205 | F | 1 | 10 | 48067 | ||
| 20 | 1035 | Sound of Music, The (1965) | Musical | 1 | 5 | 978301753 | F | 1 | 10 | 48067 | ||||
| 21 | 1097 | E.T. the Extra-Terrestrial (1982) | Children's\ | Drama\ | Fantasy\ | Sci-Fi | 1 | 4 | 978301953 | F | 1 | 10 | 48067 | |
| 22 | 1193 | One Flew Over the Cuckoo's Nest (1975) | Drama | 1 | 5 | 978300760 | F | 1 | 10 | 48067 | ||||
| 23 | 1197 | Princess Bride, The (1987) | Action\ | Adventure\ | Comedy\ | Romance | 1 | 3 | 978302268 | F | 1 | 10 | 48067 | |
| 24 | 1207 | To Kill a Mockingbird (1962) | Drama | 1 | 4 | 978300719 | F | 1 | 10 | 48067 | ||||
| 25 | 1246 | Dead Poets Society (1989) | Drama | 1 | 4 | 978302091 | F | 1 | 10 | 48067 | ||||
| 26 | 1270 | Back to the Future (1985) | Comedy\ | Sci-Fi | 1 | 5 | 978300055 | F | 1 | 10 | 48067 | |||
| 27 | 1287 | Ben-Hur (1959) | Action\ | Adventure\ | Drama | 1 | 5 | 978302039 | F | 1 | 10 | 48067 | ||
| 28 | 1545 | Ponette (1996) | Drama | 1 | 4 | 978824139 | F | 1 | 10 | 48067 | ||||
| 29 | 1566 | Hercules (1997) | Adventure\ | Animation\ | Children's\ | Comedy\ | Musical | 1 | 4 | 978824330 | F | 1 | 10 | 48067 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ||||
| 1000179 | 2763 | Thomas Crown Affair, The (1999) | Action\ | Thriller | 5727 | 5 | 958492554 | M | 25 | 4 | 92843 | |||
| 1000180 | 2770 | Bowfinger (1999) | Comedy | 5727 | 4 | 958490923 | M | 25 | 4 | 92843 | ||||
| 1000181 | 2805 | Mickey Blue Eyes (1999) | Comedy\ | Romance | 5727 | 4 | 958492371 | M | 25 | 4 | 92843 | |||
| 1000182 | 2827 | Astronaut's Wife, The (1999) | Sci-Fi\ | Thriller | 5727 | 2 | 958490886 | M | 25 | 4 | 92843 | |||
| 1000183 | 2840 | Stigmata (1999) | Thriller | 5727 | 3 | 958492512 | M | 25 | 4 | 92843 | ||||
| 1000184 | 2841 | Stir of Echoes (1999) | Thriller | 5727 | 2 | 958492512 | M | 25 | 4 | 92843 | ||||
| 1000185 | 2858 | American Beauty (1999) | Comedy\ | Drama | 5727 | 5 | 958490857 | M | 25 | 4 | 92843 | |||
| 1000186 | 2881 | Double Jeopardy (1999) | Action\ | Thriller | 5727 | 3 | 958490981 | M | 25 | 4 | 92843 | |||
| 1000187 | 2907 | Superstar (1999) | Comedy | 5727 | 1 | 958492554 | M | 25 | 4 | 92843 | ||||
| 1000188 | 3005 | Bone Collector, The (1999) | Thriller | 5727 | 2 | 958490923 | M | 25 | 4 | 92843 | ||||
| 1000189 | 3053 | Messenger: The Story of Joan of Arc, The (1999) | Drama\ | War | 5727 | 3 | 958492345 | M | 25 | 4 | 92843 | |||
| 1000190 | 3081 | Sleepy Hollow (1999) | Horror\ | Romance | 5727 | 4 | 958492512 | M | 25 | 4 | 92843 | |||
| 1000191 | 3113 | End of Days (1999) | Action\ | Thriller | 5727 | 3 | 958490981 | M | 25 | 4 | 92843 | |||
| 1000192 | 3114 | Toy Story 2 (1999) | Animation\ | Children's\ | Comedy | 5727 | 5 | 958492554 | M | 25 | 4 | 92843 | ||
| 1000193 | 3176 | Talented Mr. Ripley, The (1999) | Drama\ | Mystery\ | Thriller | 5727 | 3 | 958492554 | M | 25 | 4 | 92843 | ||
| 1000194 | 3177 | Next Friday (1999) | Comedy | 5727 | 1 | 958489748 | M | 25 | 4 | 92843 | ||||
| 1000195 | 3273 | Scream 3 (2000) | Horror\ | Mystery\ | Thriller | 5727 | 4 | 958492479 | M | 25 | 4 | 92843 | ||
| 1000196 | 3301 | Whole Nine Yards, The (2000) | Comedy\ | Crime | 5727 | 5 | 958490142 | M | 25 | 4 | 92843 | |||
| 1000197 | 3316 | Reindeer Games (2000) | Action\ | Thriller | 5727 | 3 | 958490699 | M | 25 | 4 | 92843 | |||
| 1000198 | 3317 | Wonder Boys (2000) | Comedy\ | Drama | 5727 | 4 | 958489879 | M | 25 | 4 | 92843 | |||
| 1000199 | 3408 | Erin Brockovich (2000) | Drama | 5727 | 5 | 958489879 | M | 25 | 4 | 92843 | ||||
| 1000200 | 3409 | Final Destination (2000) | Drama\ | Thriller | 5727 | 4 | 958490143 | M | 25 | 4 | 92843 | |||
| 1000201 | 3481 | High Fidelity (2000) | Comedy | 5727 | 4 | 958489879 | M | 25 | 4 | 92843 | ||||
| 1000202 | 3483 | Road to El Dorado, The (2000) | Animation\ | Children's | 5727 | 3 | 958490143 | M | 25 | 4 | 92843 | |||
| 1000203 | 3484 | Skulls, The (2000) | Thriller | 5727 | 1 | 958489902 | M | 25 | 4 | 92843 | ||||
| 1000204 | 3513 | Rules of Engagement (2000) | Drama\ | Thriller | 5727 | 4 | 958489970 | M | 25 | 4 | 92843 | |||
| 1000205 | 3535 | American Psycho (2000) | Comedy\ | Horror\ | Thriller | 5727 | 2 | 958489970 | M | 25 | 4 | 92843 | ||
| 1000206 | 3536 | Keeping the Faith (2000) | Comedy\ | Romance | 5727 | 5 | 958489902 | M | 25 | 4 | 92843 | |||
| 1000207 | 3555 | U-571 (2000) | Action\ | Thriller | 5727 | 3 | 958490699 | M | 25 | 4 | 92843 | |||
| 1000208 | 3578 | Gladiator (2000) | Action\ | Drama | 5727 | 5 | 958490171 | M | 25 | 4 | 92843 |
1000209 rows × 10 columns
1.查看每部电影中两类性别的平均分
In [79]:
# a_movie = data_obj.groupby(['MovieID', 'Gender']).mean()
# out = DataFrame(a_movie['Rating'])
# out
out = DataFrame.pivot_table(data_obj, values='Rating', index='MovieID', columns='Gender', aggfunc='mean')
outOut[79]:
| Gender | F | M |
|---|---|---|
| MovieID | ||
| 1 | 4.444444 | 4.285714 |
| 2 | 3.666667 | 3.500000 |
| 3 | 2.500000 | 2.000000 |
| 4 | NaN | 3.000000 |
| 5 | NaN | 4.000000 |
| 6 | 3.000000 | 4.000000 |
| 7 | 2.750000 | 3.500000 |
| 9 | NaN | 5.000000 |
| 10 | 5.000000 | 3.714286 |
| 11 | 4.000000 | 3.000000 |
| 13 | 3.000000 | 4.000000 |
| 14 | 5.000000 | 4.000000 |
| 15 | NaN | 2.000000 |
| 16 | 4.000000 | 3.777778 |
| 17 | 4.285714 | 3.900000 |
| 18 | NaN | 4.000000 |
| 19 | NaN | 2.000000 |
| 21 | 4.000000 | 3.166667 |
| 22 | NaN | 4.000000 |
| 23 | 3.000000 | NaN |
| 24 | 3.000000 | 3.333333 |
| 25 | 3.833333 | 4.500000 |
| 26 | 4.000000 | NaN |
| 28 | 3.000000 | 3.000000 |
| 29 | 3.000000 | 4.500000 |
| 30 | NaN | 4.500000 |
| 32 | 4.000000 | 3.916667 |
| 34 | 3.666667 | 3.866667 |
| 35 | 2.000000 | 5.000000 |
| 36 | 3.750000 | 3.714286 |
| ... | ... | ... |
| 953 | 4.750000 | 4.800000 |
| 954 | 4.750000 | 4.000000 |
| 955 | 4.000000 | 5.000000 |
| 956 | 3.000000 | NaN |
| 957 | NaN | 4.000000 |
| 959 | 4.500000 | NaN |
| 960 | 4.000000 | NaN |
| 961 | 4.000000 | 4.000000 |
| 963 | 5.000000 | 5.000000 |
| 965 | 3.500000 | 4.000000 |
| 968 | NaN | 3.714286 |
| 969 | 4.200000 | 4.181818 |
| 971 | 4.500000 | 4.333333 |
| 973 | 5.000000 | NaN |
| 974 | 3.000000 | NaN |
| 976 | 3.000000 | 4.000000 |
| 978 | 3.000000 | NaN |
| 982 | NaN | 4.000000 |
| 986 | 1.000000 | 5.000000 |
| 991 | NaN | 3.000000 |
| 994 | 5.000000 | 4.000000 |
| 996 | NaN | 3.000000 |
| 999 | NaN | 4.000000 |
| 1003 | NaN | 2.000000 |
| 1005 | 1.000000 | 2.000000 |
| 1007 | 3.000000 | 2.000000 |
| 1009 | 5.000000 | 2.750000 |
| 1010 | NaN | 3.000000 |
| 1011 | NaN | 2.000000 |
| 1012 | 3.000000 | 3.000000 |
554 rows × 2 columns
2.查看每部电影中两类性别的均分差异最大的前3的电影
In [113]:
noNaN = out.dropna()
noNaN = noNaN.copy()
noNaN['|F-M|'] = abs(noNaN['F'] - noNaN['M'])
noNaN.sort_values("|F-M|",ascending=False)
noNaN.head(3)Out[113]:
| Gender | F | M | \ | F-M\ | |
|---|---|---|---|---|---|
| MovieID | |||||
| 1 | 4.444444 | 4.285714 | 0.158730 | ||
| 2 | 3.666667 | 3.500000 | 0.166667 | ||
| 3 | 2.500000 | 2.000000 | 0.500000 |
3.查找出热门电影热度榜单 (计算表中电影出现的次数,取前3)
In [275]:
ratings_df.groupby('MovieID').count().sort_values('UserID', ascending=False).head(3)Out[275]:
| UserID | Rating | Timestamp | |
|---|---|---|---|
| MovieID | |||
| 2858 | 3428 | 3428 | 3428 |
| 260 | 2991 | 2991 | 2991 |
| 1196 | 2990 | 2990 | 2990 |
4.每部电影的评分(平均分) 获取排名前10的高分榜单
In [141]:
DataFrame(ratings_df.groupby('MovieID').mean().sort_values('Rating', ascending=False)['Rating'].head(10))Out[141]:
| Rating | |
|---|---|
| MovieID | |
| 1978 | 5.0 |
| 1611 | 5.0 |
| 1963 | 5.0 |
| 1950 | 5.0 |
| 1939 | 5.0 |
| 1937 | 5.0 |
| 1827 | 5.0 |
| 1821 | 5.0 |
| 1769 | 5.0 |
| 1563 | 5.0 |
5.前十大热门电影__的评分
In [274]:
hot = ratings_df.groupby('MovieID').count().sort_values('UserID', ascending=False)['Rating']
DataFrame(hot.head(10))Out[274]:
| Rating | |
|---|---|
| MovieID | |
| 2858 | 3428 |
| 260 | 2991 |
| 1196 | 2990 |
| 1210 | 2883 |
| 480 | 2672 |
| 2028 | 2653 |
| 589 | 2649 |
| 2571 | 2590 |
| 1270 | 2583 |
| 593 | 2578 |
6.前20名高分电影的__热度 (使用第3.热度表)
In [276]:
height20 = ratings_df.groupby('MovieID').mean().sort_values('Rating', ascending=False)['Rating']
DataFrame(height20.head(20))Out[276]:
| Rating | |
|---|---|
| MovieID | |
| 989 | 5.000000 |
| 3881 | 5.000000 |
| 1830 | 5.000000 |
| 3382 | 5.000000 |
| 787 | 5.000000 |
| 3280 | 5.000000 |
| 3607 | 5.000000 |
| 3233 | 5.000000 |
| 3172 | 5.000000 |
| 3656 | 5.000000 |
| 3245 | 4.800000 |
| 53 | 4.750000 |
| 2503 | 4.666667 |
| 2905 | 4.608696 |
| 2019 | 4.560510 |
| 318 | 4.554558 |
| 858 | 4.524966 |
| 745 | 4.520548 |
| 50 | 4.517106 |
| 527 | 4.510417 |
7.查询出前十好电影
好电影:
1.看的人多(点击在1000以上)
2.分数还高(按Rating降序)
In [277]:
DataFrame(height20[hot[hot>1000].head(20).index].sort_values(ascending=False))Out[277]:
| Rating | |
|---|---|
| MovieID | |
| 527 | 4.510417 |
| 1198 | 4.477725 |
| 260 | 4.453694 |
| 2762 | 4.406263 |
| 593 | 4.351823 |
| 2028 | 4.337354 |
| 2858 | 4.317386 |
| 2571 | 4.315830 |
| 1197 | 4.303710 |
| 1196 | 4.292977 |
| 608 | 4.254676 |
| 110 | 4.234957 |
| 1617 | 4.219406 |
| 2396 | 4.127480 |
| 589 | 4.058513 |
| 1210 | 4.022893 |
| 1270 | 3.990321 |
| 1265 | 3.953029 |
| 480 | 3.763847 |
| 1580 | 3.739953 |
使用的数据集是美国人口普查的数据
In [286]:
census_df = pd.read_csv('./data/census.csv', engine='python')
census_dfOut[286]:
| SUMLEV | REGION | DIVISION | STATE | COUNTY | STNAME | CTYNAME | CENSUS2010POP | ESTIMATESBASE2010 | POPESTIMATE2010 | ... | RDOMESTICMIG2011 | RDOMESTICMIG2012 | RDOMESTICMIG2013 | RDOMESTICMIG2014 | RDOMESTICMIG2015 | RNETMIG2011 | RNETMIG2012 | RNETMIG2013 | RNETMIG2014 | RNETMIG2015 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | 3 | 6 | 1 | 0 | Alabama | Alabama | 4779736 | 4780127 | 4785161 | ... | 0.002295 | -0.193196 | 0.381066 | 0.582002 | -0.467369 | 1.030015 | 0.826644 | 1.383282 | 1.724718 | 0.712594 |
| 1 | 50 | 3 | 6 | 1 | 1 | Alabama | Autauga County | 54571 | 54571 | 54660 | ... | 7.242091 | -2.915927 | -3.012349 | 2.265971 | -2.530799 | 7.606016 | -2.626146 | -2.722002 | 2.592270 | -2.187333 |
| 2 | 50 | 3 | 6 | 1 | 3 | Alabama | Baldwin County | 182265 | 182265 | 183193 | ... | 14.832960 | 17.647293 | 21.845705 | 19.243287 | 17.197872 | 15.844176 | 18.559627 | 22.727626 | 20.317142 | 18.293499 |
| 3 | 50 | 3 | 6 | 1 | 5 | Alabama | Barbour County | 27457 | 27457 | 27341 | ... | -4.728132 | -2.500690 | -7.056824 | -3.904217 | -10.543299 | -4.874741 | -2.758113 | -7.167664 | -3.978583 | -10.543299 |
| 4 | 50 | 3 | 6 | 1 | 7 | Alabama | Bibb County | 22915 | 22919 | 22861 | ... | -5.527043 | -5.068871 | -6.201001 | -0.177537 | 0.177258 | -5.088389 | -4.363636 | -5.403729 | 0.754533 | 1.107861 |
| 5 | 50 | 3 | 6 | 1 | 9 | Alabama | Blount County | 57322 | 57322 | 57373 | ... | 1.807375 | -1.177622 | -1.748766 | -2.062535 | -1.369970 | 1.859511 | -0.848580 | -1.402476 | -1.577232 | -0.884411 |
| 6 | 50 | 3 | 6 | 1 | 11 | Alabama | Bullock County | 10914 | 10915 | 10887 | ... | -30.953709 | -5.180127 | -1.130263 | 14.354290 | -16.167247 | -29.001673 | -2.825524 | 1.507017 | 17.243790 | -13.193961 |
| 7 | 50 | 3 | 6 | 1 | 13 | Alabama | Butler County | 20947 | 20946 | 20944 | ... | -14.032727 | -11.684234 | -5.655413 | 1.085428 | -6.529805 | -13.936612 | -11.586865 | -5.557058 | 1.184103 | -6.430868 |
| 8 | 50 | 3 | 6 | 1 | 15 | Alabama | Calhoun County | 118572 | 118586 | 118437 | ... | -6.155670 | -4.611706 | -5.524649 | -4.463211 | -3.376322 | -5.791579 | -4.092677 | -5.062836 | -3.912834 | -2.806406 |
| 9 | 50 | 3 | 6 | 1 | 17 | Alabama | Chambers County | 34215 | 34170 | 34098 | ... | -2.731639 | 3.849092 | 2.872721 | -2.287222 | 1.349468 | -1.821092 | 4.701181 | 3.781439 | -1.290228 | 2.346901 |
| 10 | 50 | 3 | 6 | 1 | 19 | Alabama | Cherokee County | 25989 | 25986 | 25976 | ... | 6.339327 | 1.113180 | 5.488706 | -0.076806 | -3.239866 | 6.416167 | 1.420264 | 5.757384 | 0.230419 | -2.931307 |
| 11 | 50 | 3 | 6 | 1 | 21 | Alabama | Chilton County | 43643 | 43631 | 43665 | ... | -1.372935 | -2.653369 | 0.480044 | 0.456017 | -2.253483 | -0.823761 | -2.447504 | 0.868651 | 0.957636 | -1.752709 |
| 12 | 50 | 3 | 6 | 1 | 23 | Alabama | Choctaw County | 13859 | 13858 | 13841 | ... | -15.455274 | -0.737028 | -8.766391 | -1.274984 | -5.291205 | -15.528177 | -0.737028 | -8.766391 | -1.274984 | -5.291205 |
| 13 | 50 | 3 | 6 | 1 | 25 | Alabama | Clarke County | 25833 | 25840 | 25767 | ... | -6.194363 | -17.667705 | -0.318345 | -8.686428 | -5.613667 | -6.077488 | -17.509958 | -0.159172 | -8.486280 | -5.411736 |
| 14 | 50 | 3 | 6 | 1 | 27 | Alabama | Clay County | 13932 | 13932 | 13880 | ... | -10.744102 | -13.345130 | 4.902871 | 5.702648 | 3.912450 | -10.816697 | -13.345130 | 4.977157 | 5.776708 | 3.986270 |
| 15 | 50 | 3 | 6 | 1 | 29 | Alabama | Cleburne County | 14972 | 14972 | 14973 | ... | -3.673524 | -5.151880 | 7.345821 | 3.654485 | -3.123961 | -3.673524 | -5.151880 | 7.345821 | 3.654485 | -3.123961 |
| 16 | 50 | 3 | 6 | 1 | 31 | Alabama | Coffee County | 49948 | 49948 | 50177 | ... | 0.377640 | 7.675579 | -13.146535 | -3.602859 | 2.214774 | 2.166460 | 11.513368 | -10.438741 | -0.767822 | 5.350738 |
| 17 | 50 | 3 | 6 | 1 | 33 | Alabama | Colbert County | 54428 | 54428 | 54514 | ... | -0.073423 | 1.065051 | 1.762390 | 1.835688 | -0.110260 | 0.513964 | 1.469035 | 2.276420 | 2.533249 | 0.588052 |
| 18 | 50 | 3 | 6 | 1 | 35 | Alabama | Conecuh County | 13228 | 13228 | 13208 | ... | -4.861559 | -7.504690 | -6.107224 | -14.645416 | 2.684140 | -4.861559 | -7.504690 | -6.107224 | -14.645416 | 2.684140 |
| 19 | 50 | 3 | 6 | 1 | 37 | Alabama | Coosa County | 11539 | 11758 | 11758 | ... | -33.930581 | -10.291443 | -4.313831 | -22.958017 | -5.387581 | -34.017138 | -10.380162 | -4.403703 | -23.049483 | -5.387581 |
| 20 | 50 | 3 | 6 | 1 | 39 | Alabama | Covington County | 37765 | 37765 | 37796 | ... | 6.696899 | -4.612668 | 0.740271 | 3.697932 | -0.316945 | 6.881460 | -4.559952 | 0.793147 | 3.750759 | -0.264121 |
| 21 | 50 | 3 | 6 | 1 | 41 | Alabama | Crenshaw County | 13906 | 13906 | 13853 | ... | 1.729792 | 3.950156 | -1.864936 | 3.084648 | 3.439504 | 2.666763 | 5.099293 | -0.502098 | 4.734577 | 5.087600 |
| 22 | 50 | 3 | 6 | 1 | 43 | Alabama | Cullman County | 80406 | 80410 | 80473 | ... | -1.404233 | -1.019628 | 4.071247 | 5.087142 | 7.915406 | -1.031427 | -0.634159 | 4.542916 | 5.593387 | 8.417777 |
| 23 | 50 | 3 | 6 | 1 | 45 | Alabama | Dale County | 50251 | 50251 | 50358 | ... | -10.749798 | -5.277150 | -15.236079 | -11.979785 | -5.107706 | -9.575283 | -0.776637 | -12.640155 | -9.503292 | -1.998668 |
| 24 | 50 | 3 | 6 | 1 | 47 | Alabama | Dallas County | 43820 | 43820 | 43803 | ... | -15.635599 | -11.308243 | -16.745678 | -9.344789 | -14.687232 | -15.727573 | -11.378047 | -16.792849 | -9.368689 | -14.711389 |
| 25 | 50 | 3 | 6 | 1 | 49 | Alabama | DeKalb County | 71109 | 71115 | 71142 | ... | 0.294677 | -9.302391 | -1.748807 | 0.267830 | 0.028141 | 1.375159 | -8.656001 | -1.029539 | 1.198187 | 0.956790 |
| 26 | 50 | 3 | 6 | 1 | 51 | Alabama | Elmore County | 79303 | 79296 | 79465 | ... | 3.235576 | 0.822717 | 1.760531 | -1.507057 | 2.067820 | 3.674511 | 1.558176 | 2.306047 | -0.951175 | 2.757093 |
| 27 | 50 | 3 | 6 | 1 | 53 | Alabama | Escambia County | 38319 | 38319 | 38309 | ... | -3.449988 | -3.855889 | -4.822706 | -1.189831 | 1.190902 | -3.397716 | -3.803428 | -4.769999 | -1.136950 | 1.243830 |
| 28 | 50 | 3 | 6 | 1 | 55 | Alabama | Etowah County | 104430 | 104427 | 104442 | ... | -1.015919 | 2.062637 | -1.931884 | -1.726932 | -2.082234 | -0.632554 | 2.446383 | -1.518596 | -1.234901 | -1.588308 |
| 29 | 50 | 3 | 6 | 1 | 57 | Alabama | Fayette County | 17241 | 17241 | 17231 | ... | -5.015601 | -0.646640 | -3.725937 | 0.296745 | -2.797536 | -5.132243 | -0.705426 | -3.785079 | 0.237396 | -2.857058 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3163 | 50 | 2 | 3 | 55 | 131 | Wisconsin | Washington County | 131887 | 131885 | 131967 | ... | -0.794876 | 0.785279 | -2.215465 | 1.601149 | -0.434498 | -0.431504 | 1.162817 | -1.763330 | 2.104796 | 0.059931 |
| 3164 | 50 | 2 | 3 | 55 | 133 | Wisconsin | Waukesha County | 389891 | 389938 | 390076 | ... | -0.765799 | 2.128860 | 0.038132 | 0.760109 | -0.719858 | 0.102448 | 3.180527 | 1.189727 | 2.077633 | 0.593567 |
| 3165 | 50 | 2 | 3 | 55 | 135 | Wisconsin | Waupaca County | 52410 | 52410 | 52422 | ... | 3.111756 | -2.241873 | 6.292687 | -0.441031 | -0.480617 | 3.359933 | -2.011937 | 6.561277 | -0.134227 | -0.173022 |
| 3166 | 50 | 2 | 3 | 55 | 137 | Wisconsin | Waushara County | 24496 | 24496 | 24506 | ... | 4.930022 | -2.404973 | -4.097017 | -4.906711 | -4.397793 | 5.174486 | -2.160399 | -3.810226 | -4.535615 | -4.024395 |
| 3167 | 50 | 2 | 3 | 55 | 139 | Wisconsin | Winnebago County | 166994 | 166994 | 167059 | ... | 0.316712 | 2.889873 | 0.833819 | -2.406192 | -4.557985 | 0.842573 | 3.502335 | 1.531624 | -1.545153 | -3.685304 |
| 3168 | 50 | 2 | 3 | 55 | 141 | Wisconsin | Wood County | 74749 | 74749 | 74807 | ... | -4.081523 | -5.019090 | -6.901200 | -5.596471 | -3.958322 | -3.733590 | -4.562809 | -6.442917 | -5.040889 | -3.414223 |
| 3169 | 40 | 4 | 8 | 56 | 0 | Wyoming | Wyoming | 563626 | 563767 | 564516 | ... | -0.381530 | 9.636214 | 4.487115 | -4.788275 | -3.221091 | 0.289680 | 10.694870 | 5.440390 | -3.727831 | -2.091573 |
| 3170 | 50 | 4 | 8 | 56 | 1 | Wyoming | Albany County | 36299 | 36299 | 36428 | ... | 3.708956 | 2.637812 | -3.544634 | -3.334877 | -9.911169 | 6.736119 | 6.433032 | 0.719587 | 1.429233 | -5.166460 |
| 3171 | 50 | 4 | 8 | 56 | 3 | Wyoming | Big Horn County | 11668 | 11668 | 11672 | ... | 4.868258 | 2.804930 | 16.815908 | -8.026420 | 5.095861 | 4.868258 | 3.144921 | 17.236306 | -7.608378 | 5.513554 |
| 3172 | 50 | 4 | 8 | 56 | 5 | Wyoming | Campbell County | 46133 | 46133 | 46244 | ... | -2.843479 | 15.601020 | -5.895711 | -8.550911 | 10.916963 | -2.649606 | 15.558684 | -5.916543 | -8.509402 | 10.978525 |
| 3173 | 50 | 4 | 8 | 56 | 7 | Wyoming | Carbon County | 15885 | 15885 | 15837 | ... | -7.581980 | -13.081441 | 3.178134 | -2.970641 | -23.300971 | -7.392431 | -12.636926 | 3.623073 | -2.338590 | -22.600668 |
| 3174 | 50 | 4 | 8 | 56 | 9 | Wyoming | Converse County | 13833 | 13833 | 13826 | ... | -12.847499 | 15.493820 | 19.035533 | -20.550587 | -0.070403 | -12.774915 | 16.502720 | 20.093063 | -19.358233 | 1.126443 |
| 3175 | 50 | 4 | 8 | 56 | 11 | Wyoming | Crook County | 7083 | 7083 | 7114 | ... | -1.544618 | -4.202564 | 1.397819 | 6.378258 | 18.629317 | -0.982939 | -3.642222 | 2.096729 | 7.071547 | 19.309219 |
| 3176 | 50 | 4 | 8 | 56 | 13 | Wyoming | Fremont County | 40123 | 40123 | 40222 | ... | 2.747083 | 7.782673 | -4.990688 | -12.331633 | -13.673610 | 3.093562 | 8.027411 | -4.747240 | -12.013555 | -13.352750 |
| 3177 | 50 | 4 | 8 | 56 | 15 | Wyoming | Goshen County | 13249 | 13247 | 13408 | ... | 14.293649 | 3.961413 | -8.079028 | -7.017803 | -11.899450 | 14.886132 | 4.841727 | -6.903896 | -5.761986 | -10.635133 |
| 3178 | 50 | 4 | 8 | 56 | 17 | Wyoming | Hot Springs County | 4812 | 4812 | 4813 | ... | 3.322604 | 6.208609 | 3.095336 | -6.017222 | -5.454164 | 5.191569 | 6.001656 | 2.888981 | -6.224712 | -5.663940 |
| 3179 | 50 | 4 | 8 | 56 | 19 | Wyoming | Johnson County | 8569 | 8569 | 8581 | ... | 4.995063 | -4.058912 | -0.812583 | -10.715742 | 0.933652 | 5.227392 | -4.058912 | -0.812583 | -10.715742 | 0.933652 |
| 3180 | 50 | 4 | 8 | 56 | 21 | Wyoming | Laramie County | 91738 | 91881 | 92271 | ... | -1.200428 | 15.547274 | 4.787847 | -1.226133 | 0.278940 | -0.973320 | 17.914554 | 6.003143 | -0.207819 | 1.673640 |
| 3181 | 50 | 4 | 8 | 56 | 23 | Wyoming | Lincoln County | 18106 | 18106 | 18091 | ... | -9.802564 | -11.566801 | 13.564556 | 6.125989 | 1.555544 | -9.691801 | -11.566801 | 13.619696 | 6.234414 | 1.662823 |
| 3182 | 50 | 4 | 8 | 56 | 25 | Wyoming | Natrona County | 75450 | 75450 | 75472 | ... | 7.189319 | 23.066162 | 24.322042 | -0.958472 | -0.061057 | 7.689674 | 23.749508 | 25.085233 | -0.110593 | 0.793743 |
| 3183 | 50 | 4 | 8 | 56 | 27 | Wyoming | Niobrara County | 2484 | 2484 | 2492 | ... | -0.401849 | 0.806452 | 29.066295 | -12.603387 | 7.492114 | -0.401849 | 0.806452 | 29.066295 | -12.603387 | 7.492114 |
| 3184 | 50 | 4 | 8 | 56 | 29 | Wyoming | Park County | 28205 | 28205 | 28259 | ... | 4.582951 | 8.057765 | 7.641997 | -9.252437 | -2.878980 | 6.486639 | 11.127389 | 10.877797 | -5.585731 | 0.856839 |
| 3185 | 50 | 4 | 8 | 56 | 31 | Wyoming | Platte County | 8667 | 8667 | 8678 | ... | 4.373094 | 5.392073 | 2.634593 | 6.055759 | 4.662270 | 4.373094 | 4.933173 | 2.176403 | 5.598720 | 4.207414 |
| 3186 | 50 | 4 | 8 | 56 | 33 | Wyoming | Sheridan County | 29116 | 29116 | 29146 | ... | 0.958559 | 8.425487 | 4.546373 | 3.678069 | -3.298406 | 2.122524 | 9.342778 | 5.523001 | 4.781489 | -2.198937 |
| 3187 | 50 | 4 | 8 | 56 | 35 | Wyoming | Sublette County | 10247 | 10247 | 10244 | ... | -23.741784 | 15.272374 | -40.870074 | -16.596273 | -22.870900 | -21.092907 | 16.828794 | -39.211861 | -14.409938 | -20.664059 |
| 3188 | 50 | 4 | 8 | 56 | 37 | Wyoming | Sweetwater County | 43806 | 43806 | 43593 | ... | 1.072643 | 16.243199 | -5.339774 | -14.252889 | -14.248864 | 1.255221 | 16.243199 | -5.295460 | -14.075283 | -14.070195 |
| 3189 | 50 | 4 | 8 | 56 | 39 | Wyoming | Teton County | 21294 | 21294 | 21297 | ... | -1.589565 | 0.972695 | 19.525929 | 14.143021 | -0.564849 | 0.654527 | 2.408578 | 21.160658 | 16.308671 | 1.520747 |
| 3190 | 50 | 4 | 8 | 56 | 41 | Wyoming | Uinta County | 21118 | 21118 | 21102 | ... | -17.755986 | -4.916350 | -6.902954 | -14.215862 | -12.127022 | -18.136812 | -5.536861 | -7.521840 | -14.740608 | -12.606351 |
| 3191 | 50 | 4 | 8 | 56 | 43 | Wyoming | Washakie County | 8533 | 8533 | 8545 | ... | -11.637475 | -0.827815 | -2.013502 | -17.781491 | 1.682288 | -11.990126 | -1.182592 | -2.250385 | -18.020168 | 1.441961 |
| 3192 | 50 | 4 | 8 | 56 | 45 | Wyoming | Weston County | 7208 | 7208 | 7181 | ... | -11.752361 | -8.040059 | 12.372583 | 1.533635 | 6.935294 | -12.032179 | -8.040059 | 12.372583 | 1.533635 | 6.935294 |
3193 rows × 100 columns
2010年中 以每个州(STNAME)人口中 最多的 3 个县的人口总和为这个州人口的衡量标准,问哪 3 个州对应的人口最多?
DataFrame(census_df.sort_values(['STNAME', 'POPESTIMATE2010'],ascending=[True,False]).groupby(['STNAME']).head(3).groupby('STNAME')['POPESTIMATE2010'].sum().sort_values(ascending=False).head(3))Out[329]:
| POPESTIMATE2010 | |
|---|---|
| STNAME | |
| California | 50264474 |
| Texas | 31726014 |
| New York | 24148828 |
各个州、县 在 2010 年至 2012 年间人口平均量是怎么样的?
In [351]:
temp = census_df[["POPESTIMATE2010","POPESTIMATE2011","POPESTIMATE2012"]]
# census_df["three-year-total"] = temp.sum(axis=1) #axis 0为列,1为行
census_df["three-year-avg"] = temp.mean(axis=1)
# census_df[['STNAME', 'CTYNAME', 'three-year-total', 'three-year-avg']]
census_df[['STNAME', 'CTYNAME', 'three-year-avg']].head(10)Out[351]:
| STNAME | CTYNAME | three-year-avg | |
|---|---|---|---|
| 0 | Alabama | Alabama | 4.800786e+06 |
| 1 | Alabama | Autauga County | 5.502933e+04 |
| 2 | Alabama | Baldwin County | 1.867493e+05 |
| 3 | Alabama | Barbour County | 2.724200e+04 |
| 4 | Alabama | Bibb County | 2.274533e+04 |
| 5 | Alabama | Blount County | 5.762000e+04 |
| 6 | Alabama | Bullock County | 1.070733e+04 |
| 7 | Alabama | Butler County | 2.067500e+04 |
| 8 | Alabama | Calhoun County | 1.178303e+05 |
| 9 | Alabama | Chambers County | 3.405533e+04 |
在 2010 年至 2012 年间人口增加数量最多的是哪3个县?
In [352]:
census_df['Δ/➖'] = census_df['POPESTIMATE2012'] - census_df['POPESTIMATE2010']
DataFrame(census_df[['STNAME', 'CTYNAME', 'Δ/➖']].sort_values('Δ/➖', ascending=False).head(3))Out[352]:
| STNAME | CTYNAME | Δ/➖ | |
|---|---|---|---|
| 2566 | Texas | Texas | 845378 |
| 190 | California | California | 721976 |
| 329 | Florida | Florida | 502131 |
yuehui.163.com 数据分析
In [1041]:
girls_df = pd.read_csv('./data/girls', encoding='utf-8', sep='::', engine='python')
girls_df.head(10)Out[1041]:
| 会员ID | 昵称 | 年龄 | 省市 | 市区县 | 星座 | 学历 | 学历转分数 | 工资 | 婚否 | 身高 | 体重 | 职业 | 性别 | 头像 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30135985 | 做个小女人 | 47 | 北京 | 朝阳区 | 狮子座 | 本科 | 60 | 9000 | 1 | 162 | 70 | 其他行业 | 0 | http://yuehui1.nos.netease.com/images/portrait... |
| 1 | 37827353 | 云海 | 35 | 北京 | 西城区 | 摩羯座 | 硕士 | 70 | 0 | 1 | 169 | 55 | 主管/经理 | 0 | http://yuehui1.nos.netease.com/images/portrait... |
| 2 | 47845164 | 紫色柔情 | 47 | 北京 | 海淀区 | 摩羯座 | 本科 | 60 | 6500 | 1 | 163 | 55 | 教师 | 0 | http://yuehui1.nos.netease.com/images/portrait... |
| 3 | 615430496 | 回眸一笑 | 48 | 北京 | 海淀区 | 摩羯座 | 本科 | 60 | 6500 | 1 | 163 | 54 | 教师 | 0 | http://yuehui1.nos.netease.com/images/portrait... |
| 4 | 615995687 | 空谷幽兰 | 47 | 北京 | 海淀区 | 摩羯座 | 本科 | 60 | 6500 | 1 | 163 | 55 | 教师 | 0 | http://yuehui1.nos.netease.com/images/portrait... |
| 5 | 32714149 | 风中百合 | 65 | 北京 | 西城区 | 射手座 | 本科 | 60 | 6500 | 1 | 163 | 50 | 其他行业 | 0 | http://yuehui1.nos.netease.com/images/portrait... |
| 6 | 15735842 | 小草 | 55 | 北京 | 空 | 水瓶座 | 本科 | 60 | 2500 | 1 | 165 | 58 | 工程师 | 0 | http://yuehui1.nos.netease.com/images/portrait... |
| 7 | 700172079 | 回眸一笑 | 50 | 北京 | 朝阳区 | 天蝎座 | 专科 | 50 | 9000 | 1 | 163 | 60 | 传媒/影视 | 0 | http://yuehui1.nos.netease.com/images/portrait... |
| 8 | 19335879 | 雨后彩虹 | 56 | 北京 | 朝阳区 | 摩羯座 | 专科 | 50 | 6500 | 1 | 168 | 63 | 其他行业 | 0 | http://yuehui1.nos.netease.com/images/portrait... |
| 9 | 30493341 | 烟雨蒙蒙 | 46 | 北京 | 东城区 | 水瓶座 | 本科 | 60 | 0 | 1 | 160 | 59 | 其他行业 | 0 | http://yuehui1.nos.netease.com/images/portrait... |
-1. 得到男女生数据总和
In [1042]:
sex_count = DataFrame(girls_df.groupby('性别').count()['会员ID'])
sex_count.rename(columns={'会员ID': '人数'}, inplace=True)
sex_countOut[1042]:
| 人数 | |
|---|---|
| 性别 | |
| 0 | 25484 |
| 1 | 19197 |
0. 分析每个地区的男女生比例
In [1043]:
temp_copy = girls_df
# 改名
temp_copy['性别'] = girls_df['性别'].replace([0,1],['女','男'])
# 总和
total = temp_copy.groupby('省市')[['会员ID']].count()
# 人数 / 总和
province_sex = temp_copy.groupby(['省市', '性别'])[['会员ID']].count()
evey_province_sex_percent = province_sex / total
evey_province_sex_percent.rename(columns={'会员ID': '人数所占百分比'}, inplace=True)
evey_province_sex_percentOut[1043]:
| 人数所占百分比 | ||
|---|---|---|
| 省市 | 性别 | |
| 上海 | 女 | 0.821717 |
| 男 | 0.178283 | |
| 乌鲁木齐 | 女 | 0.413987 |
| 男 | 0.586013 | |
| 兰州 | 女 | 0.473684 |
| 男 | 0.526316 | |
| 北京 | 女 | 0.819867 |
| 男 | 0.180133 | |
| 南京 | 女 | 0.571429 |
| 男 | 0.428571 | |
| 南宁 | 女 | 0.583333 |
| 男 | 0.416667 | |
| 南昌 | 女 | 0.363636 |
| 男 | 0.636364 | |
| 台湾 | 女 | 0.615385 |
| 男 | 0.384615 | |
| 合肥 | 女 | 0.461538 |
| 男 | 0.538462 | |
| 呼和浩特 | 女 | 0.615385 |
| 男 | 0.384615 | |
| 哈尔滨 | 女 | 0.552898 |
| 男 | 0.447102 | |
| 天津 | 女 | 0.872491 |
| 男 | 0.127509 | |
| 太原 | 女 | 0.600000 |
| 男 | 0.400000 | |
| 广州 | 女 | 0.600000 |
| 男 | 0.400000 | |
| 成都 | 女 | 0.388889 |
| 男 | 0.611111 | |
| ... | ... | ... |
| 沈阳 | 女 | 0.625578 |
| 男 | 0.374422 | |
| 济南 | 女 | 0.700000 |
| 男 | 0.300000 | |
| 海口 | 女 | 0.475000 |
| 男 | 0.525000 | |
| 澳门 | 女 | 0.333333 |
| 男 | 0.666667 | |
| 石家庄 | 女 | 0.700000 |
| 男 | 0.300000 | |
| 福州 | 女 | 0.416667 |
| 男 | 0.583333 | |
| 西宁 | 女 | 0.526316 |
| 男 | 0.473684 | |
| 西安 | 女 | 0.546218 |
| 男 | 0.453782 | |
| 贵阳 | 女 | 0.666667 |
| 男 | 0.333333 | |
| 郑州 | 女 | 0.413043 |
| 男 | 0.586957 | |
| 重庆 | 女 | 0.663551 |
| 男 | 0.336449 | |
| 银川 | 女 | 0.459550 |
| 男 | 0.540450 | |
| 长春 | 女 | 0.500000 |
| 男 | 0.500000 | |
| 长沙 | 女 | 0.526316 |
| 男 | 0.473684 | |
| 香港 | 女 | 0.410324 |
| 男 | 0.589676 |
68 rows × 1 columns
1. 查看那5个星座发布的交友信息最多
In [1044]:
# 计数
mostPostStart = DataFrame(girls_df.groupby('星座').count()['会员ID'])
# 改一下列名
mostPostStart.rename(columns={'会员ID': '发部次数'}, inplace=True)
# 输出前5
mostPostStart.sort_values(by='发部次数', ascending=False).head(5)Out[1044]:
| 发部次数 | |
|---|---|
| 星座 | |
| 摩羯座 | 6053 |
| 双子座 | 5331 |
| 天秤座 | 3806 |
| 金牛座 | 3492 |
| 巨蟹座 | 3422 |
2. 查看薪资排名前10 的省份
薪资为保密,0,空值的不进行计算
薪资在区间内的取其均值
50000以上的视为50000 2000以下的视为2000
In [1045]:
# 过滤掉工资为 -1 的行 不统计
have_income = girls_df[~girls_df['工资'].isin([-1])]
# 省份人数总工资 / 计数
DataFrame(have_income.groupby('省市').sum()['工资'] / girls_df.groupby('省市').count()['会员ID'], columns=['平均薪资']).sort_values(by='平均薪资', ascending=False).head(10)Out[1045]:
| 平均薪资 | |
|---|---|
| 省市 | |
| 香港 | 14634.017207 |
| 澳门 | 12677.777778 |
| 台湾 | 10237.891738 |
| 杭州 | 9697.530864 |
| 成都 | 9047.668038 |
| 北京 | 8617.494440 |
| 南京 | 8280.864198 |
| 福州 | 8011.316872 |
| 长沙 | 7850.227420 |
| 广州 | 7762.962963 |
3. 查看哪10个年龄的交友需求比较多 (过滤掉所有无效数据)
In [1046]:
# 计数
mostPostAge = DataFrame(girls_df.groupby('年龄').count()['会员ID'].sort_values(ascending=False))
mostPostAge.rename(columns={'会员ID': '发布次数'}, inplace=True)
mostPostAge.head(10)Out[1046]:
| 发布次数 | |
|---|---|
| 年龄 | |
| 36 | 2134 |
| 37 | 2054 |
| 34 | 1925 |
| 32 | 1913 |
| 29 | 1898 |
| 33 | 1855 |
| 35 | 1852 |
| 31 | 1846 |
| 39 | 1839 |
| 30 | 1793 |
4. 得到各个年龄段的男女的人数
In [1047]:
# 创建副本
temp_copy = girls_df
temp_copy['性别']=temp_copy['性别'].replace([0,1],['女','男'])
# 按年龄 性别 计算数量
all_age_count = DataFrame(DataFrame(temp_copy.groupby(['年龄', '性别']).count())['会员ID'])
all_age_count.rename(columns={'会员ID': '人数'}, inplace=True)
all_age_countOut[1047]:
| 人数 | ||
|---|---|---|
| 年龄 | 性别 | |
| 18 | 女 | 99 |
| 男 | 68 | |
| 19 | 女 | 135 |
| 男 | 88 | |
| 20 | 女 | 132 |
| 男 | 108 | |
| 21 | 女 | 216 |
| 男 | 116 | |
| 22 | 女 | 456 |
| 男 | 257 | |
| 23 | 女 | 326 |
| 男 | 132 | |
| 24 | 女 | 472 |
| 男 | 154 | |
| 25 | 女 | 495 |
| 男 | 179 | |
| 26 | 女 | 824 |
| 男 | 206 | |
| 27 | 女 | 903 |
| 男 | 252 | |
| 28 | 女 | 1103 |
| 男 | 358 | |
| 29 | 女 | 1344 |
| 男 | 554 | |
| 30 | 女 | 1251 |
| 男 | 542 | |
| 31 | 女 | 1255 |
| 男 | 591 | |
| 32 | 女 | 1201 |
| 男 | 712 | |
| ... | ... | ... |
| 74 | 女 | 2 |
| 男 | 2 | |
| 75 | 男 | 6 |
| 76 | 女 | 11 |
| 男 | 6 | |
| 77 | 女 | 2 |
| 男 | 5 | |
| 78 | 女 | 17 |
| 男 | 6 | |
| 79 | 女 | 16 |
| 男 | 15 | |
| 80 | 女 | 15 |
| 男 | 17 | |
| 81 | 女 | 4 |
| 男 | 12 | |
| 82 | 男 | 17 |
| 83 | 女 | 14 |
| 男 | 11 | |
| 84 | 女 | 15 |
| 男 | 29 | |
| 85 | 女 | 23 |
| 男 | 17 | |
| 86 | 女 | 11 |
| 男 | 9 | |
| 87 | 女 | 8 |
| 男 | 6 | |
| 88 | 女 | 59 |
| 男 | 45 | |
| 89 | 女 | 142 |
| 男 | 107 |
142 rows × 1 columns
5. 各个学历的占比
In [1076]:
# 合并 空 和 高中以下学历
# girls_df['学历']=girls_df['学历'].replace(['空'],['高中以下'])
# degree_percent.groupby(['学历']).count()
# temp_copy.groupby(['学历']).count()
# 计算 每个学历 对应的总数
degree_temp = DataFrame(DataFrame(temp_copy.groupby(['学历']).count())['会员ID'])
# degree_temp
# 换列名
degree_temp.rename(columns={'会员ID': '百分比'}, inplace=True)
# degree_temp
# degree_temp['学历']['高中以下'] + degree_temp['学历']['空']
# 按列计算每个元素所占百分比
DataFrame(degree_temp['百分比'].apply(lambda x :str((x/degree_temp['百分比'].sum()) * 100) + '%')).sort_values(by='百分比', ascending=False)Out[1076]:
| 百分比 | |
|---|---|
| 学历 | |
| 高中 | 8.690494841207672% |
| 高中以下 | 8.430876658982566% |
| 空 | 6.049551263400551% |
| 硕士 | 5.049126026722768% |
| 本科 | 46.08894160828988% |
| 专科 | 24.03706273360041% |
| 博士 | 1.6539468677961549% |
6. 得出前10个平均学历最高的城市
无学历的默认为高中学历
假设学历满分为100
高中以下30分,高中40分,专科50分,本科60分,硕士70分,博士80分
In [1049]:
degree_height = girls_df.groupby(['省市', '市区县']).mean().sort_values(by='学历转分数', ascending=False)[['学历转分数']]
# 换列名
degree_height.rename(columns={'学历转分数': '平均学历分数'}, inplace=True)
degree_height.head(10)Out[1049]:
| 平均学历分数 | ||
|---|---|---|
| 省市 | 市区县 | |
| 哈尔滨 | 宁安市 | 80.000000 |
| 抚远县 | 70.000000 | |
| 南宁 | 隆安县 | 66.666667 |
| 石家庄 | 元氏县 | 60.000000 |
| 哈尔滨 | 东宁县 | 60.000000 |
| 木兰县 | 60.000000 | |
| 石家庄 | 栾城县 | 60.000000 |
| 南京 | 空 | 58.252427 |
| 白下区 | 58.000000 | |
| 哈尔滨 | 让胡路区 | 58.000000 |
7. 得到身材相对匀称 人数最多的10个省份
按照BMI指数得到结果
按照:70kg÷(1.75×1.75)=22.86)的比例
结果 如果在18.5-23.9之间的 属于标准身材
In [1067]:
body_thin = girls_df
# body_thin.copy()
# 过滤掉 身高 / 体重 为 0 的行 不统计
body_thin = body_thin[~body_thin['身高'].isin([0])]
body_thin = body_thin.copy()
body_thin['BMI'] = body_thin['体重'] / (body_thin['身高'] * body_thin['身高'] / 10000)
thin_filter = body_thin[(body_thin['BMI'] >= 18.5) & (body_thin['BMI'] <= 23.9)]
thin_temp = thin_filter.groupby('省市').count()[['会员ID']].sort_values(by='会员ID', ascending=False).head(10)
thin_tempOut[1067]:
| 会员ID | |
|---|---|
| 省市 | |
| 北京 | 1657 |
| 上海 | 1492 |
| 天津 | 1271 |
| 重庆 | 1037 |
| 成都 | 915 |
| 长沙 | 880 |
| 兰州 | 804 |
| 西宁 | 778 |
| 西安 | 771 |
| 香港 | 739 |
8. 三个图
身高 体重散点图
top10 薪资省份(图任选)
满足bmi指数的人数最多的10个省份(柱状图,类似效果)
In [1051]:
temp = girls_df
temp = temp[(girls_df['身高'] < 240) & (girls_df['体重'] < 400)]
plt.scatter(temp['身高'], temp['体重'])Out[1051]:
<matplotlib.collections.PathCollection at 0x1d418b075f8>
# 过滤掉工资为 -1 的行 不统计
# 省份人数总工资 / 计数
income_10 = DataFrame(girls_df[~girls_df['工资'].isin([-1])].groupby('省市').sum()['工资'] / girls_df.groupby('省市').count()['会员ID'], columns=['平均薪资']).sort_values(by='平均薪资', ascending=False).head(10)
plt.bar(Series(income_10.index), income_10['平均薪资'], width=0.5)
plt.show()
plt.bar(Series(thin_temp.index), thin_temp['会员ID'], width=0.5)
plt.show()
提供云存储服务