In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [5]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
In [6]:
s
Out[6]:
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
In [7]:
dates = pd.date_range('20220101', '20220108')
dates
Out[7]:
DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08'],
              dtype='datetime64[ns]', freq='D')
In [8]:
df = pd.DataFrame(np.random.randn(len(dates), 4), index=dates, columns=list("ABCD"))
df
Out[8]:
A B C D
2022-01-01 -0.631687 1.188095 1.338382 -0.551332
2022-01-02 1.409102 -1.883717 -0.228207 0.536107
2022-01-03 1.067096 0.835134 1.201175 -1.649642
2022-01-04 0.968663 -0.581464 0.295065 -1.619879
2022-01-05 0.239929 -2.513251 1.112255 -0.643571
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324
2022-01-08 0.271669 -0.059790 -0.029706 -2.394378
In [10]:
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
df2
Out[10]:
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
In [11]:
df2.dtypes
Out[11]:
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
In [19]:
df.head(1)
Out[19]:
A B C D
2022-01-01 -0.631687 1.188095 1.338382 -0.551332
In [21]:
df.tail(2)
Out[21]:
A B C D
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324
2022-01-08 0.271669 -0.059790 -0.029706 -2.394378
In [22]:
df.index
Out[22]:
DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08'],
              dtype='datetime64[ns]', freq='D')
In [23]:
df.columns
Out[23]:
Index(['A', 'B', 'C', 'D'], dtype='object')
In [29]:
print(df.axes[0][1])
2022-01-02 00:00:00
In [ ]:
 
In [34]:
print(df)
df.hist()
                   A         B         C         D
2022-01-01 -0.631687  1.188095  1.338382 -0.551332
2022-01-02  1.409102 -1.883717 -0.228207  0.536107
2022-01-03  1.067096  0.835134  1.201175 -1.649642
2022-01-04  0.968663 -0.581464  0.295065 -1.619879
2022-01-05  0.239929 -2.513251  1.112255 -0.643571
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974
2022-01-07 -0.461846  0.771496 -0.100905 -1.080324
2022-01-08  0.271669 -0.059790 -0.029706 -2.394378
Out[34]:
array([[<AxesSubplot:title={'center':'A'}>,
        <AxesSubplot:title={'center':'B'}>],
       [<AxesSubplot:title={'center':'C'}>,
        <AxesSubplot:title={'center':'D'}>]], dtype=object)
No description has been provided for this image
In [35]:
df.values
Out[35]:
array([[-0.63168745,  1.1880953 ,  1.33838185, -0.55133226],
       [ 1.40910215, -1.88371741, -0.22820721,  0.53610691],
       [ 1.06709582,  0.8351338 ,  1.20117467, -1.64964158],
       [ 0.96866279, -0.58146391,  0.29506536, -1.6198791 ],
       [ 0.23992856, -2.51325115,  1.1122553 , -0.64357132],
       [-2.42949833, -1.45165431, -0.63917879, -2.94797382],
       [-0.46184615,  0.77149624, -0.10090492, -1.08032395],
       [ 0.2716691 , -0.05979006, -0.0297059 , -2.39437784]])
In [36]:
df.describe()
Out[36]:
A B C D
count 8.000000 8.000000 8.000000 8.000000
mean 0.054178 -0.461894 0.368610 -1.293874
std 1.235502 1.378988 0.750521 1.105099
min -2.429498 -2.513251 -0.639179 -2.947974
25% -0.504306 -1.559670 -0.132730 -1.835826
50% 0.255799 -0.320627 0.132680 -1.350102
75% 0.993271 0.787406 1.134485 -0.620512
max 1.409102 1.188095 1.338382 0.536107
In [40]:
df.T
Out[40]:
2022-01-01 2022-01-02 2022-01-03 2022-01-04 2022-01-05 2022-01-06 2022-01-07 2022-01-08
A -0.631687 1.409102 1.067096 0.968663 0.239929 -2.429498 -0.461846 0.271669
B 1.188095 -1.883717 0.835134 -0.581464 -2.513251 -1.451654 0.771496 -0.059790
C 1.338382 -0.228207 1.201175 0.295065 1.112255 -0.639179 -0.100905 -0.029706
D -0.551332 0.536107 -1.649642 -1.619879 -0.643571 -2.947974 -1.080324 -2.394378
In [41]:
df.sort_index(axis=1, ascending=False)
Out[41]:
D C B A
2022-01-01 -0.551332 1.338382 1.188095 -0.631687
2022-01-02 0.536107 -0.228207 -1.883717 1.409102
2022-01-03 -1.649642 1.201175 0.835134 1.067096
2022-01-04 -1.619879 0.295065 -0.581464 0.968663
2022-01-05 -0.643571 1.112255 -2.513251 0.239929
2022-01-06 -2.947974 -0.639179 -1.451654 -2.429498
2022-01-07 -1.080324 -0.100905 0.771496 -0.461846
2022-01-08 -2.394378 -0.029706 -0.059790 0.271669
In [48]:
df.sort_values(by="A", ascending=False)
Out[48]:
A B C D
2022-01-02 1.409102 -1.883717 -0.228207 0.536107
2022-01-03 1.067096 0.835134 1.201175 -1.649642
2022-01-04 0.968663 -0.581464 0.295065 -1.619879
2022-01-08 0.271669 -0.059790 -0.029706 -2.394378
2022-01-05 0.239929 -2.513251 1.112255 -0.643571
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324
2022-01-01 -0.631687 1.188095 1.338382 -0.551332
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974
In [61]:
df['20220104':'20220106']
Out[61]:
A B C D
2022-01-04 0.968663 -0.581464 0.295065 -1.619879
2022-01-05 0.239929 -2.513251 1.112255 -0.643571
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974
In [68]:
df.loc[:, ['A', 'D']]
Out[68]:
A D
2022-01-01 -0.631687 -0.551332
2022-01-02 1.409102 0.536107
2022-01-03 1.067096 -1.649642
2022-01-04 0.968663 -1.619879
2022-01-05 0.239929 -0.643571
2022-01-06 -2.429498 -2.947974
2022-01-07 -0.461846 -1.080324
2022-01-08 0.271669 -2.394378
In [70]:
df.loc['20220104':'20220106',['A','B']]
Out[70]:
A B
2022-01-04 0.968663 -0.581464
2022-01-05 0.239929 -2.513251
2022-01-06 -2.429498 -1.451654
In [71]:
df.at[dates[1], 'A']
Out[71]:
1.4091021538454698
In [72]:
df[df.A > 0]
Out[72]:
A B C D
2022-01-02 1.409102 -1.883717 -0.228207 0.536107
2022-01-03 1.067096 0.835134 1.201175 -1.649642
2022-01-04 0.968663 -0.581464 0.295065 -1.619879
2022-01-05 0.239929 -2.513251 1.112255 -0.643571
2022-01-08 0.271669 -0.059790 -0.029706 -2.394378
In [74]:
df2 = df.copy()

df2['E'] = ['one', 'one','two','three','four','three', 'four','three']
df2
Out[74]:
A B C D E
2022-01-01 -0.631687 1.188095 1.338382 -0.551332 one
2022-01-02 1.409102 -1.883717 -0.228207 0.536107 one
2022-01-03 1.067096 0.835134 1.201175 -1.649642 two
2022-01-04 0.968663 -0.581464 0.295065 -1.619879 three
2022-01-05 0.239929 -2.513251 1.112255 -0.643571 four
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974 three
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324 four
2022-01-08 0.271669 -0.059790 -0.029706 -2.394378 three
In [75]:
df2[df2['E'].isin(('two', 'four'))]
Out[75]:
A B C D E
2022-01-03 1.067096 0.835134 1.201175 -1.649642 two
2022-01-05 0.239929 -2.513251 1.112255 -0.643571 four
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324 four
In [78]:
df.at[dates[0],'A'] = 0
df
Out[78]:
A B C D
2022-01-01 0.000000 1.188095 1.338382 -0.551332
2022-01-02 1.409102 -1.883717 -0.228207 0.536107
2022-01-03 1.067096 0.835134 1.201175 -1.649642
2022-01-04 0.968663 -0.581464 0.295065 -1.619879
2022-01-05 0.239929 -2.513251 1.112255 -0.643571
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324
2022-01-08 0.271669 -0.059790 -0.029706 -2.394378
In [79]:
df.iat[0,1] = 0
df
Out[79]:
A B C D
2022-01-01 0.000000 0.000000 1.338382 -0.551332
2022-01-02 1.409102 -1.883717 -0.228207 0.536107
2022-01-03 1.067096 0.835134 1.201175 -1.649642
2022-01-04 0.968663 -0.581464 0.295065 -1.619879
2022-01-05 0.239929 -2.513251 1.112255 -0.643571
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324
2022-01-08 0.271669 -0.059790 -0.029706 -2.394378
In [80]:
df.iat[0,3] = 0
df
Out[80]:
A B C D
2022-01-01 0.000000 0.000000 1.338382 0.000000
2022-01-02 1.409102 -1.883717 -0.228207 0.536107
2022-01-03 1.067096 0.835134 1.201175 -1.649642
2022-01-04 0.968663 -0.581464 0.295065 -1.619879
2022-01-05 0.239929 -2.513251 1.112255 -0.643571
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324
2022-01-08 0.271669 -0.059790 -0.029706 -2.394378
In [83]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20220102', periods=6))
df['F'] = s1
df
Out[83]:
A B C D F
2022-01-01 0.000000 0.000000 1.338382 0.000000 NaN
2022-01-02 1.409102 -1.883717 -0.228207 0.536107 1.0
2022-01-03 1.067096 0.835134 1.201175 -1.649642 2.0
2022-01-04 0.968663 -0.581464 0.295065 -1.619879 3.0
2022-01-05 0.239929 -2.513251 1.112255 -0.643571 4.0
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974 5.0
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324 6.0
2022-01-08 0.271669 -0.059790 -0.029706 -2.394378 NaN
In [87]:
print(type(df.values[0, -1]), df.values[0, -1])
<class 'numpy.float64'> nan
In [89]:
df1 = df.copy()
df1.dropna(how='any')
Out[89]:
A B C D F
2022-01-02 1.409102 -1.883717 -0.228207 0.536107 1.0
2022-01-03 1.067096 0.835134 1.201175 -1.649642 2.0
2022-01-04 0.968663 -0.581464 0.295065 -1.619879 3.0
2022-01-05 0.239929 -2.513251 1.112255 -0.643571 4.0
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974 5.0
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324 6.0
In [90]:
df1.fillna(value=5)
Out[90]:
A B C D F
2022-01-01 0.000000 0.000000 1.338382 0.000000 5.0
2022-01-02 1.409102 -1.883717 -0.228207 0.536107 1.0
2022-01-03 1.067096 0.835134 1.201175 -1.649642 2.0
2022-01-04 0.968663 -0.581464 0.295065 -1.619879 3.0
2022-01-05 0.239929 -2.513251 1.112255 -0.643571 4.0
2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974 5.0
2022-01-07 -0.461846 0.771496 -0.100905 -1.080324 6.0
2022-01-08 0.271669 -0.059790 -0.029706 -2.394378 5.0
In [91]:
pd.isna(df1)
Out[91]:
A B C D F
2022-01-01 False False False False True
2022-01-02 False False False False False
2022-01-03 False False False False False
2022-01-04 False False False False False
2022-01-05 False False False False False
2022-01-06 False False False False False
2022-01-07 False False False False False
2022-01-08 False False False False True
In [99]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates[:6]).shift(2)
s
Out[99]:
2022-01-01    NaN
2022-01-02    NaN
2022-01-03    1.0
2022-01-04    3.0
2022-01-05    5.0
2022-01-06    NaN
Freq: D, dtype: float64
In [97]:
s
Out[97]:
2022-01-01    NaN
2022-01-02    NaN
2022-01-03    1.0
2022-01-04    3.0
2022-01-05    5.0
2022-01-06    NaN
Freq: D, dtype: float64
In [96]:
df.add(s, axis='index')
Out[96]:
A B C D F
2022-01-01 NaN NaN NaN NaN NaN
2022-01-02 NaN NaN NaN NaN NaN
2022-01-03 2.067096 1.835134 2.201175 -0.649642 3.0
2022-01-04 3.968663 2.418536 3.295065 1.380121 6.0
2022-01-05 5.239929 2.486749 6.112255 4.356429 9.0
2022-01-06 NaN NaN NaN NaN NaN
2022-01-07 NaN NaN NaN NaN NaN
2022-01-08 NaN NaN NaN NaN NaN
In [103]:
df.apply(lambda x: x.max() - x.min())
Out[103]:
A    3.838600
B    3.348385
C    1.977561
D    3.484081
F    5.000000
dtype: float64
In [105]:
df.apply(lambda x: x.max() - x.min(), axis=1)
Out[105]:
2022-01-01    1.338382
2022-01-02    3.292820
2022-01-03    3.649642
2022-01-04    4.619879
2022-01-05    6.513251
2022-01-06    7.947974
2022-01-07    7.080324
2022-01-08    2.666047
Freq: D, dtype: float64
In [106]:
s = pd.Series(np.random.randint(0, 7, size=10))
s
Out[106]:
0    5
1    0
2    3
3    1
4    0
5    3
6    3
7    0
8    3
9    5
dtype: int32
In [110]:
s.value_counts(ascending=True)
Out[110]:
1    1
5    2
0    3
3    4
dtype: int64
In [3]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                  'C': np.random.randn(8),
                   'D': np.random.randn(8)})
df
Out[3]:
A B C D
0 foo one -0.542044 -0.780312
1 bar one -1.685851 1.667171
2 foo two -2.512135 -1.103894
3 bar three -0.250959 -0.420705
4 foo two -3.222366 1.776841
5 bar two -1.228211 -0.396067
6 foo one 2.056563 -0.819939
7 foo three 0.819403 0.671113
In [114]:
df.groupby("A").sum()
Out[114]:
C D
A
bar 0.404722 0.857120
foo -0.315457 1.005343
In [115]:
df.groupby(['A','B']).sum()
Out[115]:
C D
A B
bar one 1.733757 -0.444961
three 0.222557 1.324174
two -1.551593 -0.022093
foo one -0.076205 -0.837857
three -0.645545 0.798104
two 0.406294 1.045096
In [116]:
df.to_csv('foo.csv')
In [118]:
pd.read_csv('foo.csv')
Out[118]:
Unnamed: 0 A B C D
0 0 foo one -0.545937 -0.334651
1 1 bar one 1.733757 -0.444961
2 2 foo two -0.006299 0.113370
3 3 bar three 0.222557 1.324174
4 4 foo two 0.412593 0.931726
5 5 bar two -1.551593 -0.022093
6 6 foo one 0.469732 -0.503207
7 7 foo three -0.645545 0.798104
In [4]:
df.to_hdf('foo.h5', 'df')
In [5]:
pd.read_hdf('foo.h5', 'df')
Out[5]:
A B C D
0 foo one -0.542044 -0.780312
1 bar one -1.685851 1.667171
2 foo two -2.512135 -1.103894
3 bar three -0.250959 -0.420705
4 foo two -3.222366 1.776841
5 bar two -1.228211 -0.396067
6 foo one 2.056563 -0.819939
7 foo three 0.819403 0.671113
In [9]:
pd.read_excel(r"C:\Users\Administrator\Desktop\雪球内容爬虫需求2022.1.10.xlsx", 0, engine="openpyxl")
Out[9]:
统计时间 股吧内码 基金代码 实时排名 股票中文名
0 2022-01-04 13:10:00 2432 SZ002432 1 九安医疗
1 2022-01-04 13:10:00 2256 SZ002256 2 兆新股份
2 2022-01-04 13:10:00 2665 SZ002665 3 首航高科
3 2022-01-04 13:10:00 2354 SZ002354 4 天神娱乐
4 2022-01-04 13:10:00 2349 SZ002349 5 精华制药
... ... ... ... ... ...
495 2022-01-04 13:10:00 600654 SH600654 496 ST中安
496 2022-01-04 13:10:00 2082 SZ002082 497 万邦德
497 2022-01-04 13:10:00 300025 SZ300025 498 华星创业
498 2022-01-04 13:10:00 2017 SZ002017 499 东信和平
499 2022-01-04 13:10:00 300267 SZ300267 500 尔康制药

500 rows × 5 columns