In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [5]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
In [6]:
s
Out[6]:
0 1.0 1 3.0 2 5.0 3 NaN 4 6.0 5 8.0 dtype: float64
In [7]:
dates = pd.date_range('20220101', '20220108')
dates
Out[7]:
DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04', '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08'], dtype='datetime64[ns]', freq='D')
In [8]:
df = pd.DataFrame(np.random.randn(len(dates), 4), index=dates, columns=list("ABCD"))
df
Out[8]:
A | B | C | D | |
---|---|---|---|---|
2022-01-01 | -0.631687 | 1.188095 | 1.338382 | -0.551332 |
2022-01-02 | 1.409102 | -1.883717 | -0.228207 | 0.536107 |
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 |
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 |
2022-01-06 | -2.429498 | -1.451654 | -0.639179 | -2.947974 |
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 |
2022-01-08 | 0.271669 | -0.059790 | -0.029706 | -2.394378 |
In [10]:
df2 = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'})
df2
Out[10]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
In [11]:
df2.dtypes
Out[11]:
A float64 B datetime64[ns] C float32 D int32 E category F object dtype: object
In [19]:
df.head(1)
Out[19]:
A | B | C | D | |
---|---|---|---|---|
2022-01-01 | -0.631687 | 1.188095 | 1.338382 | -0.551332 |
In [21]:
df.tail(2)
Out[21]:
A | B | C | D | |
---|---|---|---|---|
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 |
2022-01-08 | 0.271669 | -0.059790 | -0.029706 | -2.394378 |
In [22]:
df.index
Out[22]:
DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04', '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08'], dtype='datetime64[ns]', freq='D')
In [23]:
df.columns
Out[23]:
Index(['A', 'B', 'C', 'D'], dtype='object')
In [29]:
print(df.axes[0][1])
2022-01-02 00:00:00
In [ ]:
In [34]:
print(df)
df.hist()
A B C D 2022-01-01 -0.631687 1.188095 1.338382 -0.551332 2022-01-02 1.409102 -1.883717 -0.228207 0.536107 2022-01-03 1.067096 0.835134 1.201175 -1.649642 2022-01-04 0.968663 -0.581464 0.295065 -1.619879 2022-01-05 0.239929 -2.513251 1.112255 -0.643571 2022-01-06 -2.429498 -1.451654 -0.639179 -2.947974 2022-01-07 -0.461846 0.771496 -0.100905 -1.080324 2022-01-08 0.271669 -0.059790 -0.029706 -2.394378
Out[34]:
array([[<AxesSubplot:title={'center':'A'}>, <AxesSubplot:title={'center':'B'}>], [<AxesSubplot:title={'center':'C'}>, <AxesSubplot:title={'center':'D'}>]], dtype=object)
In [35]:
df.values
Out[35]:
array([[-0.63168745, 1.1880953 , 1.33838185, -0.55133226], [ 1.40910215, -1.88371741, -0.22820721, 0.53610691], [ 1.06709582, 0.8351338 , 1.20117467, -1.64964158], [ 0.96866279, -0.58146391, 0.29506536, -1.6198791 ], [ 0.23992856, -2.51325115, 1.1122553 , -0.64357132], [-2.42949833, -1.45165431, -0.63917879, -2.94797382], [-0.46184615, 0.77149624, -0.10090492, -1.08032395], [ 0.2716691 , -0.05979006, -0.0297059 , -2.39437784]])
In [36]:
df.describe()
Out[36]:
A | B | C | D | |
---|---|---|---|---|
count | 8.000000 | 8.000000 | 8.000000 | 8.000000 |
mean | 0.054178 | -0.461894 | 0.368610 | -1.293874 |
std | 1.235502 | 1.378988 | 0.750521 | 1.105099 |
min | -2.429498 | -2.513251 | -0.639179 | -2.947974 |
25% | -0.504306 | -1.559670 | -0.132730 | -1.835826 |
50% | 0.255799 | -0.320627 | 0.132680 | -1.350102 |
75% | 0.993271 | 0.787406 | 1.134485 | -0.620512 |
max | 1.409102 | 1.188095 | 1.338382 | 0.536107 |
In [40]:
df.T
Out[40]:
2022-01-01 | 2022-01-02 | 2022-01-03 | 2022-01-04 | 2022-01-05 | 2022-01-06 | 2022-01-07 | 2022-01-08 | |
---|---|---|---|---|---|---|---|---|
A | -0.631687 | 1.409102 | 1.067096 | 0.968663 | 0.239929 | -2.429498 | -0.461846 | 0.271669 |
B | 1.188095 | -1.883717 | 0.835134 | -0.581464 | -2.513251 | -1.451654 | 0.771496 | -0.059790 |
C | 1.338382 | -0.228207 | 1.201175 | 0.295065 | 1.112255 | -0.639179 | -0.100905 | -0.029706 |
D | -0.551332 | 0.536107 | -1.649642 | -1.619879 | -0.643571 | -2.947974 | -1.080324 | -2.394378 |
In [41]:
df.sort_index(axis=1, ascending=False)
Out[41]:
D | C | B | A | |
---|---|---|---|---|
2022-01-01 | -0.551332 | 1.338382 | 1.188095 | -0.631687 |
2022-01-02 | 0.536107 | -0.228207 | -1.883717 | 1.409102 |
2022-01-03 | -1.649642 | 1.201175 | 0.835134 | 1.067096 |
2022-01-04 | -1.619879 | 0.295065 | -0.581464 | 0.968663 |
2022-01-05 | -0.643571 | 1.112255 | -2.513251 | 0.239929 |
2022-01-06 | -2.947974 | -0.639179 | -1.451654 | -2.429498 |
2022-01-07 | -1.080324 | -0.100905 | 0.771496 | -0.461846 |
2022-01-08 | -2.394378 | -0.029706 | -0.059790 | 0.271669 |
In [48]:
df.sort_values(by="A", ascending=False)
Out[48]:
A | B | C | D | |
---|---|---|---|---|
2022-01-02 | 1.409102 | -1.883717 | -0.228207 | 0.536107 |
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 |
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 |
2022-01-08 | 0.271669 | -0.059790 | -0.029706 | -2.394378 |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 |
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 |
2022-01-01 | -0.631687 | 1.188095 | 1.338382 | -0.551332 |
2022-01-06 | -2.429498 | -1.451654 | -0.639179 | -2.947974 |
In [61]:
df['20220104':'20220106']
Out[61]:
A | B | C | D | |
---|---|---|---|---|
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 |
2022-01-06 | -2.429498 | -1.451654 | -0.639179 | -2.947974 |
In [68]:
df.loc[:, ['A', 'D']]
Out[68]:
A | D | |
---|---|---|
2022-01-01 | -0.631687 | -0.551332 |
2022-01-02 | 1.409102 | 0.536107 |
2022-01-03 | 1.067096 | -1.649642 |
2022-01-04 | 0.968663 | -1.619879 |
2022-01-05 | 0.239929 | -0.643571 |
2022-01-06 | -2.429498 | -2.947974 |
2022-01-07 | -0.461846 | -1.080324 |
2022-01-08 | 0.271669 | -2.394378 |
In [70]:
df.loc['20220104':'20220106',['A','B']]
Out[70]:
A | B | |
---|---|---|
2022-01-04 | 0.968663 | -0.581464 |
2022-01-05 | 0.239929 | -2.513251 |
2022-01-06 | -2.429498 | -1.451654 |
In [71]:
df.at[dates[1], 'A']
Out[71]:
1.4091021538454698
In [72]:
df[df.A > 0]
Out[72]:
A | B | C | D | |
---|---|---|---|---|
2022-01-02 | 1.409102 | -1.883717 | -0.228207 | 0.536107 |
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 |
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 |
2022-01-08 | 0.271669 | -0.059790 | -0.029706 | -2.394378 |
In [74]:
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three', 'four','three']
df2
Out[74]:
A | B | C | D | E | |
---|---|---|---|---|---|
2022-01-01 | -0.631687 | 1.188095 | 1.338382 | -0.551332 | one |
2022-01-02 | 1.409102 | -1.883717 | -0.228207 | 0.536107 | one |
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 | two |
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 | three |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 | four |
2022-01-06 | -2.429498 | -1.451654 | -0.639179 | -2.947974 | three |
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 | four |
2022-01-08 | 0.271669 | -0.059790 | -0.029706 | -2.394378 | three |
In [75]:
df2[df2['E'].isin(('two', 'four'))]
Out[75]:
A | B | C | D | E | |
---|---|---|---|---|---|
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 | two |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 | four |
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 | four |
In [78]:
df.at[dates[0],'A'] = 0
df
Out[78]:
A | B | C | D | |
---|---|---|---|---|
2022-01-01 | 0.000000 | 1.188095 | 1.338382 | -0.551332 |
2022-01-02 | 1.409102 | -1.883717 | -0.228207 | 0.536107 |
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 |
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 |
2022-01-06 | -2.429498 | -1.451654 | -0.639179 | -2.947974 |
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 |
2022-01-08 | 0.271669 | -0.059790 | -0.029706 | -2.394378 |
In [79]:
df.iat[0,1] = 0
df
Out[79]:
A | B | C | D | |
---|---|---|---|---|
2022-01-01 | 0.000000 | 0.000000 | 1.338382 | -0.551332 |
2022-01-02 | 1.409102 | -1.883717 | -0.228207 | 0.536107 |
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 |
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 |
2022-01-06 | -2.429498 | -1.451654 | -0.639179 | -2.947974 |
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 |
2022-01-08 | 0.271669 | -0.059790 | -0.029706 | -2.394378 |
In [80]:
df.iat[0,3] = 0
df
Out[80]:
A | B | C | D | |
---|---|---|---|---|
2022-01-01 | 0.000000 | 0.000000 | 1.338382 | 0.000000 |
2022-01-02 | 1.409102 | -1.883717 | -0.228207 | 0.536107 |
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 |
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 |
2022-01-06 | -2.429498 | -1.451654 | -0.639179 | -2.947974 |
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 |
2022-01-08 | 0.271669 | -0.059790 | -0.029706 | -2.394378 |
In [83]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20220102', periods=6))
df['F'] = s1
df
Out[83]:
A | B | C | D | F | |
---|---|---|---|---|---|
2022-01-01 | 0.000000 | 0.000000 | 1.338382 | 0.000000 | NaN |
2022-01-02 | 1.409102 | -1.883717 | -0.228207 | 0.536107 | 1.0 |
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 | 2.0 |
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 | 3.0 |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 | 4.0 |
2022-01-06 | -2.429498 | -1.451654 | -0.639179 | -2.947974 | 5.0 |
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 | 6.0 |
2022-01-08 | 0.271669 | -0.059790 | -0.029706 | -2.394378 | NaN |
In [87]:
print(type(df.values[0, -1]), df.values[0, -1])
<class 'numpy.float64'> nan
In [89]:
df1 = df.copy()
df1.dropna(how='any')
Out[89]:
A | B | C | D | F | |
---|---|---|---|---|---|
2022-01-02 | 1.409102 | -1.883717 | -0.228207 | 0.536107 | 1.0 |
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 | 2.0 |
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 | 3.0 |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 | 4.0 |
2022-01-06 | -2.429498 | -1.451654 | -0.639179 | -2.947974 | 5.0 |
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 | 6.0 |
In [90]:
df1.fillna(value=5)
Out[90]:
A | B | C | D | F | |
---|---|---|---|---|---|
2022-01-01 | 0.000000 | 0.000000 | 1.338382 | 0.000000 | 5.0 |
2022-01-02 | 1.409102 | -1.883717 | -0.228207 | 0.536107 | 1.0 |
2022-01-03 | 1.067096 | 0.835134 | 1.201175 | -1.649642 | 2.0 |
2022-01-04 | 0.968663 | -0.581464 | 0.295065 | -1.619879 | 3.0 |
2022-01-05 | 0.239929 | -2.513251 | 1.112255 | -0.643571 | 4.0 |
2022-01-06 | -2.429498 | -1.451654 | -0.639179 | -2.947974 | 5.0 |
2022-01-07 | -0.461846 | 0.771496 | -0.100905 | -1.080324 | 6.0 |
2022-01-08 | 0.271669 | -0.059790 | -0.029706 | -2.394378 | 5.0 |
In [91]:
pd.isna(df1)
Out[91]:
A | B | C | D | F | |
---|---|---|---|---|---|
2022-01-01 | False | False | False | False | True |
2022-01-02 | False | False | False | False | False |
2022-01-03 | False | False | False | False | False |
2022-01-04 | False | False | False | False | False |
2022-01-05 | False | False | False | False | False |
2022-01-06 | False | False | False | False | False |
2022-01-07 | False | False | False | False | False |
2022-01-08 | False | False | False | False | True |
In [99]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates[:6]).shift(2)
s
Out[99]:
2022-01-01 NaN 2022-01-02 NaN 2022-01-03 1.0 2022-01-04 3.0 2022-01-05 5.0 2022-01-06 NaN Freq: D, dtype: float64
In [97]:
s
Out[97]:
2022-01-01 NaN 2022-01-02 NaN 2022-01-03 1.0 2022-01-04 3.0 2022-01-05 5.0 2022-01-06 NaN Freq: D, dtype: float64
In [96]:
df.add(s, axis='index')
Out[96]:
A | B | C | D | F | |
---|---|---|---|---|---|
2022-01-01 | NaN | NaN | NaN | NaN | NaN |
2022-01-02 | NaN | NaN | NaN | NaN | NaN |
2022-01-03 | 2.067096 | 1.835134 | 2.201175 | -0.649642 | 3.0 |
2022-01-04 | 3.968663 | 2.418536 | 3.295065 | 1.380121 | 6.0 |
2022-01-05 | 5.239929 | 2.486749 | 6.112255 | 4.356429 | 9.0 |
2022-01-06 | NaN | NaN | NaN | NaN | NaN |
2022-01-07 | NaN | NaN | NaN | NaN | NaN |
2022-01-08 | NaN | NaN | NaN | NaN | NaN |
In [103]:
df.apply(lambda x: x.max() - x.min())
Out[103]:
A 3.838600 B 3.348385 C 1.977561 D 3.484081 F 5.000000 dtype: float64
In [105]:
df.apply(lambda x: x.max() - x.min(), axis=1)
Out[105]:
2022-01-01 1.338382 2022-01-02 3.292820 2022-01-03 3.649642 2022-01-04 4.619879 2022-01-05 6.513251 2022-01-06 7.947974 2022-01-07 7.080324 2022-01-08 2.666047 Freq: D, dtype: float64
In [106]:
s = pd.Series(np.random.randint(0, 7, size=10))
s
Out[106]:
0 5 1 0 2 3 3 1 4 0 5 3 6 3 7 0 8 3 9 5 dtype: int32
In [110]:
s.value_counts(ascending=True)
Out[110]:
1 1 5 2 0 3 3 4 dtype: int64
In [3]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
df
Out[3]:
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | -0.542044 | -0.780312 |
1 | bar | one | -1.685851 | 1.667171 |
2 | foo | two | -2.512135 | -1.103894 |
3 | bar | three | -0.250959 | -0.420705 |
4 | foo | two | -3.222366 | 1.776841 |
5 | bar | two | -1.228211 | -0.396067 |
6 | foo | one | 2.056563 | -0.819939 |
7 | foo | three | 0.819403 | 0.671113 |
In [114]:
df.groupby("A").sum()
Out[114]:
C | D | |
---|---|---|
A | ||
bar | 0.404722 | 0.857120 |
foo | -0.315457 | 1.005343 |
In [115]:
df.groupby(['A','B']).sum()
Out[115]:
C | D | ||
---|---|---|---|
A | B | ||
bar | one | 1.733757 | -0.444961 |
three | 0.222557 | 1.324174 | |
two | -1.551593 | -0.022093 | |
foo | one | -0.076205 | -0.837857 |
three | -0.645545 | 0.798104 | |
two | 0.406294 | 1.045096 |
In [116]:
df.to_csv('foo.csv')
In [118]:
pd.read_csv('foo.csv')
Out[118]:
Unnamed: 0 | A | B | C | D | |
---|---|---|---|---|---|
0 | 0 | foo | one | -0.545937 | -0.334651 |
1 | 1 | bar | one | 1.733757 | -0.444961 |
2 | 2 | foo | two | -0.006299 | 0.113370 |
3 | 3 | bar | three | 0.222557 | 1.324174 |
4 | 4 | foo | two | 0.412593 | 0.931726 |
5 | 5 | bar | two | -1.551593 | -0.022093 |
6 | 6 | foo | one | 0.469732 | -0.503207 |
7 | 7 | foo | three | -0.645545 | 0.798104 |
In [4]:
df.to_hdf('foo.h5', 'df')
In [5]:
pd.read_hdf('foo.h5', 'df')
Out[5]:
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | -0.542044 | -0.780312 |
1 | bar | one | -1.685851 | 1.667171 |
2 | foo | two | -2.512135 | -1.103894 |
3 | bar | three | -0.250959 | -0.420705 |
4 | foo | two | -3.222366 | 1.776841 |
5 | bar | two | -1.228211 | -0.396067 |
6 | foo | one | 2.056563 | -0.819939 |
7 | foo | three | 0.819403 | 0.671113 |
In [9]:
pd.read_excel(r"C:\Users\Administrator\Desktop\雪球内容爬虫需求2022.1.10.xlsx", 0, engine="openpyxl")
Out[9]:
统计时间 | 股吧内码 | 基金代码 | 实时排名 | 股票中文名 | |
---|---|---|---|---|---|
0 | 2022-01-04 13:10:00 | 2432 | SZ002432 | 1 | 九安医疗 |
1 | 2022-01-04 13:10:00 | 2256 | SZ002256 | 2 | 兆新股份 |
2 | 2022-01-04 13:10:00 | 2665 | SZ002665 | 3 | 首航高科 |
3 | 2022-01-04 13:10:00 | 2354 | SZ002354 | 4 | 天神娱乐 |
4 | 2022-01-04 13:10:00 | 2349 | SZ002349 | 5 | 精华制药 |
... | ... | ... | ... | ... | ... |
495 | 2022-01-04 13:10:00 | 600654 | SH600654 | 496 | ST中安 |
496 | 2022-01-04 13:10:00 | 2082 | SZ002082 | 497 | 万邦德 |
497 | 2022-01-04 13:10:00 | 300025 | SZ300025 | 498 | 华星创业 |
498 | 2022-01-04 13:10:00 | 2017 | SZ002017 | 499 | 东信和平 |
499 | 2022-01-04 13:10:00 | 300267 | SZ300267 | 500 | 尔康制药 |
500 rows × 5 columns