pandas tutorial


As we can see, if we do not specify a dtype, pandas infers one on its own.

import numpy as np
import pandas as pd

data = np.array(['a', 'b', 'c', 'd'])
s = pd.Series(data, index=np.arange(100, 104))
s


100    a
101    b
102    c
103    d
dtype: object
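
The dtype can also be set explicitly instead of being inferred; a quick sketch (not from the original notebook):

s_int = pd.Series([1, 2, 3])                    # inferred dtype: int64
s_flt = pd.Series([1, 2, 3], dtype='float64')   # explicit dtype: float64
s_int.dtype, s_flt.dtype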

Creating a Series from a dict

data = {'a':0, "b":1, 'c':2}
s = pd.Series(data)
s


a    0
b    1
c    2
dtype: int64


s.index


Index(['a', 'b', 'c'], dtype='object')


s = pd.Series(data, index=['b', 'c', 'd', 'a'])
s


b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

Creating a Series from a scalar

s = pd.Series(5, index=np.arange(5, 9))
s


5    5
6    5
7    5
8    5
dtype: int64

s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
s


a    1
b    2
c    3
d    4
e    5
dtype: int64


s[0], s[1], s[2]


(1, 2, 3)


s[:2], s[2:]


(a    1
 b    2
 dtype: int64, c    3
 d    4
 e    5
 dtype: int64)


s[-3:]


c    3
d    4
e    5
dtype: int64


s['a'], s['b'], s['c']


(1, 2, 3)


s[['a', 'b', 'e']]


a    1
b    2
e    5
dtype: int64

pandas.DataFrame(data, index, columns, dtype, copy)

df = pd.DataFrame()
df

Empty DataFrame
Columns: []
Index: []

data = [1, 2, 3, 4, 5]
df = pd.DataFrame(data)
df

   0
0  1
1  2
2  3
3  4
4  5

data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df

     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13

Creating a DataFrame from a dict

data = {'Name':['Alex', 'Bob', 'Clarke'], 'Age':[10., 12., 13.]}
df = pd.DataFrame(data)
df

     Name   Age
0    Alex  10.0
1     Bob  12.0
2  Clarke  13.0

data = {'Name':['Alex', 'Bob', 'Clarke'], 'Age':[10., 12., 'NaN']}  # lengths must match
df = pd.DataFrame(data, index=['rank1', 'rank2', 'rank3'])
df


         Name  Age
rank1    Alex   10
rank2     Bob   12
rank3  Clarke  NaN

data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]  # lengths need not match
df = pd.DataFrame(data)
df


   a   b     c
0  1   2   NaN
1  5  10  20.0

df1 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b'])
df2 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b1'])
print(df1)
print(df2)


        a   b
first   1   2
second  5  10
        a  b1
first   1 NaN
second  5 NaN


data = {
    'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    'two': pd.Series([1, 2, 3, 4.], index=['a', 'b', 'c', 'd'])
}
df = pd.DataFrame(data)
df

   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0

Column selection

df['one']


a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

Adding columns

df['three'] = pd.Series([10, 20, 30])


df

   one  two  three
a  1.0  1.0    NaN
b  2.0  2.0    NaN
c  3.0  3.0    NaN
d  NaN  4.0    NaN

df['three'] = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
df

   one  two  three
a  1.0  1.0   10.0
b  2.0  2.0   20.0
c  3.0  3.0   30.0
d  NaN  4.0    NaN

df['four'] = df['one'] + df['two']
df

   one  two  three  four
a  1.0  1.0   10.0   2.0
b  2.0  2.0   20.0   4.0
c  3.0  3.0   30.0   6.0
d  NaN  4.0    NaN   NaN

Removing columns

del df['one']
df

   two  three  four
a  1.0   10.0   2.0
b  2.0   20.0   4.0
c  3.0   30.0   6.0
d  4.0    NaN   NaN

df.pop('three')
df

   two  four
a  1.0   2.0
b  2.0   4.0
c  3.0   6.0
d  4.0   NaN

Row selection, addition, and removal

data = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(data)
df

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4

df.loc['b']  # row b


one    2.0
two    2.0
Name: b, dtype: float64


df.iloc[1]  # select by integer position 0, 1, 2, ...


one    2.0
two    2.0
Name: b, dtype: float64


df[2:4]  # slicing rows works, but df[0] would be an error


   one  two
c  3.0    3
d  NaN    4

df = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])
df = df.append(df2)  # append adds rows to the DataFrame; note that it does not modify the original but returns a copy
df


   a  b
0  1  2
1  3  4
0  5  6
1  7  8
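
In recent pandas versions DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat gives the same result, as a rough sketch:

df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['a', 'b'])
pd.concat([df, df2])   # row labels 0, 1 repeat, exactly as with append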

df.drop(1)  # drop removes rows by index label, here every row labelled 1; it also returns a copy


   a  b
0  1  2
0  5  6

pandas.Panel(data, items, major_axis, minor_axis, dtype, copy)

items: axis=0

major_axis: axis=1

minor_axis: axis=2

data = np.random.rand(2, 4, 5)
data


array([[[0.13766405, 0.31453832, 0.51876265, 0.97380794, 0.28314695],
        [0.02942928, 0.28957222, 0.38716041, 0.67941481, 0.54108452],
        [0.84420857, 0.60339649, 0.49242029, 0.34838561, 0.91342058],
        [0.1127622 , 0.28420695, 0.22687715, 0.06842055, 0.87414373]],

       [[0.07591772, 0.86028356, 0.30468089, 0.15491769, 0.04969857],
        [0.31649918, 0.85154403, 0.73062637, 0.99916418, 0.3809675 ],
        [0.63817574, 0.81089715, 0.41390597, 0.6660661 , 0.91651907],
        [0.24497635, 0.43923643, 0.01833888, 0.98348271, 0.89717517]]])


p = pd.Panel(data)
p


C:\Ana\lib\site-packages\IPython\core\interactiveshell.py:3267: FutureWarning:
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  exec(code_obj, self.user_global_ns, self.user_ns)

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 5 (minor_axis)
Items axis: 0 to 1
Major_axis axis: 0 to 3
Minor_axis axis: 0 to 4


data = {'Item1' : pd.DataFrame(np.random.randn(4, 3)),
   'Item2' : pd.DataFrame(np.random.randn(4, 2))}
p = pd.Panel(data)
p


C:\Ana\lib\site-packages\IPython\core\interactiveshell.py:3267: FutureWarning:
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  exec(code_obj, self.user_global_ns, self.user_ns)

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 3 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 0 to 3
Minor_axis axis: 0 to 2


p['Item1']

          0         1         2
0  1.796552  1.614647 -2.199413
1 -1.213886 -1.438678 -1.045931
2 -2.178608  1.212732  0.526674
3 -0.360727 -0.135351  0.678293

p.major_xs(0)

      Item1     Item2
0  1.796552  0.845528
1  1.614647 -0.708260
2 -2.199413       NaN

p.minor_xs(0)

      Item1     Item2
0  1.796552  0.845528
1 -1.213886  0.555775
2 -2.178608  0.925129
3 -0.360727 -0.380906

p.major_axis, p.minor_axis


(RangeIndex(start=0, stop=4, step=1), RangeIndex(start=0, stop=3, step=1))
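
As the FutureWarning says, the recommended replacement for Panel is a DataFrame with a MultiIndex (or the xarray package); a minimal sketch using pd.concat on the same dict of DataFrames (level names chosen here just for illustration):

data = {'Item1': pd.DataFrame(np.random.randn(4, 3)),
        'Item2': pd.DataFrame(np.random.randn(4, 2))}
stacked = pd.concat(data, names=['item', 'major'])  # rows indexed by (item, major)
stacked.loc['Item1']                                 # roughly what p['Item1'] gave above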

Series

s = pd.Series(np.random.rand(5), index=['a', 'b', 'c', 'd', 'e'])
s


a    0.795298
b    0.141144
c    0.125098
d    0.965541
e    0.957783
dtype: float64


s.axes  # returns the index (as a list of axes)


[Index(['a', 'b', 'c', 'd', 'e'], dtype='object')]


s.dtype


dtype('float64')


s.empty  # whether the Series is empty


False


s.ndim


1


s.size


5


s.values  # returns the values as an ndarray


array([0.79529816, 0.14114367, 0.12509848, 0.96554135, 0.95778323])


s.head(3)  # returns the first n elements


a    0.795298
b    0.141144
c    0.125098
dtype: float64


s.tail(3)  # returns the last n elements


c    0.125098
d    0.965541
e    0.957783
dtype: float64

DataFrame

data = {"a":[1, 2, 3], 'b':['r', 'g', 'b']}
df = pd.DataFrame(data, index=['first', 'second', "third"])
df

        a  b
first   1  r
second  2  g
third   3  b

df.T  # transpose


  first second third
a     1      2     3
b     r      g     b

df.axes  # two Index objects: the row index and the column index


[Index(['first', 'second', 'third'], dtype='object'),
 Index(['a', 'b'], dtype='object')]


df.dtypes


a     int64
b    object
dtype: object


df.empty


False


df.ndim


2


df.shape


(3, 2)


df.size  #3 x 2


6


df.values  # returns the values as an ndarray


array([[1, 'r'],
       [2, 'g'],
       [3, 'b']], dtype=object)


df.head(2)

        a  b
first   1  r
second  2  g

df.tail(2)

        a  b
second  2  g
third   3  b

#Create a Dictionary of series
d = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Smith','Jack',
   'Lee','David','Gasper','Betina','Andres']),
   'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
   'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])
}

#Create a DataFrame
df = pd.DataFrame(d)
df

      Name  Age  Rating
0      Tom   25    4.23
1    James   26    3.24
2    Ricky   25    3.98
3      Vin   23    2.56
4    Steve   30    3.20
5    Smith   29    4.60
6     Jack   23    3.80
7      Lee   34    3.78
8    David   40    2.98
9   Gasper   30    4.80
10  Betina   51    4.10
11  Andres   46    3.65

df.sum()  # by default sums each column (i.e. over the rows, axis=0)


Name      TomJamesRickyVinSteveSmithJackLeeDavidGasperBe...
Age                                                     382
Rating                                                44.92
dtype: object


df.sum()[1]


382


df.sum(1)  # axis=1 sums across the columns of each row; string columns are skipped automatically


0     29.23
1     29.24
2     28.98
3     25.56
4     33.20
5     33.60
6     26.80
7     37.78
8     42.98
9     34.80
10    55.10
11    49.65
dtype: float64


df.mean()


Age       31.833333
Rating     3.743333
dtype: float64


df.mean()['Age']


31.833333333333332


df.std()


Age       9.232682
Rating    0.661628
dtype: float64

1 count() Number of non-null observations

2 sum() Sum of values

3 mean() Mean of Values

4 median() Median of Values

5 mode() Mode of values

6 std() Standard Deviation of the Values

7 min() Minimum Value

8 max() Maximum Value

9 abs() Absolute Value

10 prod() Product of Values

11 cumsum() Cumulative Sum

12 cumprod() Cumulative Product
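
All of these follow the same pattern as sum() and mean(); a quick sketch on the df defined above:

df['Age'].median()                 # 29.5
df['Age'].min(), df['Age'].max()   # (23, 51)
df['Rating'].cumsum().tail(3)      # running total of the ratings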

df.describe()

             Age     Rating
count  12.000000  12.000000
mean   31.833333   3.743333
std     9.232682   0.661628
min    23.000000   2.560000
25%    25.000000   3.230000
50%    29.500000   3.790000
75%    35.500000   4.132500
max    51.000000   4.800000

df.describe(include=['object'])

          Name
count       12
unique      12
top     Gasper
freq         1

object: for string columns

number: for numeric columns

all: for all columns

df.describe(include='all')  # note: pass 'all' as a plain string, not inside a list, or an error is raised

          Name        Age     Rating
count       12  12.000000  12.000000
unique      12        NaN        NaN
top     Gasper        NaN        NaN
freq         1        NaN        NaN
mean       NaN  31.833333   3.743333
std        NaN   9.232682   0.661628
min        NaN  23.000000   2.560000
25%        NaN  25.000000   3.230000
50%        NaN  29.500000   3.790000
75%        NaN  35.500000   4.132500
max        NaN  51.000000   4.800000

df.describe(include='object')

          Name
count       12
unique      12
top     Gasper
freq         1

pipe(func, …) -- applies func to the whole DataFrame

apply(func, axis) -- applies func to each column or each row

applymap(func) -- applies func to every element

df = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])
df

       col1      col2      col3
0  1.363701  0.533478  1.850151
1  0.541553 -1.190178  1.204944
2  0.181793 -0.199892 -0.602374
3 -0.411247  1.978019  1.183671
4 -0.045223  1.444328 -0.121690

def adder(ele1,ele2):
    print("****")
    print(ele1)
    print("****")
    print(ele2)
    return ele1+ele2


df.pipe(adder, 2)  # as the output shows, ele1 is the whole DataFrame and ele2 is 2


****
       col1      col2      col3
0  1.363701  0.533478  1.850151
1  0.541553 -1.190178  1.204944
2  0.181793 -0.199892 -0.602374
3 -0.411247  1.978019  1.183671
4 -0.045223  1.444328 -0.121690
****
2

       col1      col2      col3
0  3.363701  2.533478  3.850151
1  2.541553  0.809822  3.204944
2  2.181793  1.800108  1.397626
3  1.588753  3.978019  3.183671
4  1.954777  3.444328  1.878310

df2 = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])
df2

       col1      col2      col3
0  0.386211  1.297222 -0.413626
1 -1.873829 -0.007802 -0.857307
2 -0.881874 -2.026235  0.540769
3  0.458257 -0.590630  0.685780
4  0.177258 -1.843835  0.131939

def inf_print(x):
    print(type(x))
    print("*******")
    print(x)


df2.apply(inf_print, 0)


<class 'pandas.core.series.Series'>
*******
0    0.386211
1   -1.873829
2   -0.881874
3    0.458257
4    0.177258
Name: col1, dtype: float64
<class 'pandas.core.series.Series'>
*******
0    1.297222
1   -0.007802
2   -2.026235
3   -0.590630
4   -1.843835
Name: col2, dtype: float64
<class 'pandas.core.series.Series'>
*******
0   -0.413626
1   -0.857307
2    0.540769
3    0.685780
4    0.131939
Name: col3, dtype: float64

col1    None
col2    None
col3    None
dtype: object


df2.apply(inf_print, 1)


<class 'pandas.core.series.Series'>
*******
col1    0.386211
col2    1.297222
col3   -0.413626
Name: 0, dtype: float64
<class 'pandas.core.series.Series'>
*******
col1   -1.873829
col2   -0.007802
col3   -0.857307
Name: 1, dtype: float64
<class 'pandas.core.series.Series'>
*******
col1   -0.881874
col2   -2.026235
col3    0.540769
Name: 2, dtype: float64
<class 'pandas.core.series.Series'>
*******
col1    0.458257
col2   -0.590630
col3    0.685780
Name: 3, dtype: float64
<class 'pandas.core.series.Series'>
*******
col1    0.177258
col2   -1.843835
col3    0.131939
Name: 4, dtype: float64

0    None
1    None
2    None
3    None
4    None
dtype: object


df2.apply(np.mean)  # apply passes each column (or each row if the second argument is 1) to the function in turn


col1   -0.346795
col2   -0.634256
col3    0.017511
dtype: float64


df2.applymap(lambda x: x*100)

         col1        col2       col3
0   38.621115  129.722168 -41.362553
1 -187.382897   -0.780203 -85.730689
2  -88.187450 -202.623474  54.076880
3   45.825709  -59.062993  68.578010
4   17.725785 -184.383525  13.193880

N=20

df = pd.DataFrame({
   'A': pd.date_range(start='2016-01-01',periods=N,freq='D'),
   'x': np.linspace(0,stop=N-1,num=N),
   'y': np.random.rand(N),
   'C': np.random.choice(['Low','Medium','High'],N).tolist(),
   'D': np.random.normal(100, 10, size=(N)).tolist()
})
df

            A     x         y       C           D
0  2016-01-01   0.0  0.173488    High  117.632385
1  2016-01-02   1.0  0.493186     Low  114.066702
2  2016-01-03   2.0  0.982273    High  102.389228
3  2016-01-04   3.0  0.329518     Low  104.405035
4  2016-01-05   4.0  0.392182  Medium   85.867100
5  2016-01-06   5.0  0.905708    High  103.248690
6  2016-01-07   6.0  0.731801     Low  100.177698
7  2016-01-08   7.0  0.772975    High   97.365013
8  2016-01-09   8.0  0.953258     Low   90.228303
9  2016-01-10   9.0  0.503579    High   99.946431
10 2016-01-11  10.0  0.580698     Low   88.411279
11 2016-01-12  11.0  0.268562    High   91.238630
12 2016-01-13  12.0  0.462713    High   86.720994
13 2016-01-14  13.0  0.482387    High  104.549789
14 2016-01-15  14.0  0.963168  Medium  108.565120
15 2016-01-16  15.0  0.692654    High  112.370992
16 2016-01-17  16.0  0.716956    High  112.949463
17 2016-01-18  17.0  0.897878     Low  107.860172
18 2016-01-19  18.0  0.289202  Medium   90.430672
19 2016-01-20  19.0  0.957986    High  115.225753

df.reindex(index=(0, 2, 5), columns=['A', 'C', 'B'])  # there is no column B, so it is filled with NaN

           A     C   B
0 2016-01-01  High NaN
2 2016-01-03  High NaN
5 2016-01-06  High NaN

reindex_like

df1 = pd.DataFrame(np.random.randn(4, 4))
df2 = pd.DataFrame(np.random.randn(3, 3))
df1

          0         1         2         3
0 -2.515820 -0.027034 -0.695420  0.368491
1 -1.055241  0.778208 -1.062983 -1.715173
2  0.178253 -0.186661  0.615827  1.379872
3 -1.316952 -0.209785 -0.953194 -0.138620

df2

          0         1         2
0 -0.913434  1.641339 -2.418425
1  0.113041  0.721168  0.446690
2  2.606504 -0.972984 -2.588228

df1.reindex_like(df2)

          0         1         2
0 -2.515820 -0.027034 -0.695420
1 -1.055241  0.778208 -1.062983
2  0.178253 -0.186661  0.615827

pad/ffill - fill with the preceding value

bfill/backfill - fill with the following value

nearest - fill with the nearest value

df2.reindex_like(df1, method="ffill")

          0         1         2         3
0 -0.913434  1.641339 -2.418425 -2.418425
1  0.113041  0.721168  0.446690  0.446690
2  2.606504 -0.972984 -2.588228 -2.588228
3  2.606504 -0.972984 -2.588228 -2.588228

df2.reindex_like(df1, method="bfill")

          0         1         2   3
0 -0.913434  1.641339 -2.418425 NaN
1  0.113041  0.721168  0.446690 NaN
2  2.606504 -0.972984 -2.588228 NaN
3       NaN       NaN       NaN NaN

df2.reindex_like(df1, method="nearest")

          0         1         2         3
0 -0.913434  1.641339 -2.418425 -2.418425
1  0.113041  0.721168  0.446690  0.446690
2  2.606504 -0.972984 -2.588228 -2.588228
3  2.606504 -0.972984 -2.588228 -2.588228

limit

df1 = pd.DataFrame(np.random.randn(7, 7))
df2 = pd.DataFrame(np.random.randn(3, 3))
df1, df2


(          0         1         2         3         4         5         6
 0  0.592257  0.913287  1.276314  0.064212  1.338661 -0.110666 -0.459020
 1 -0.104347  0.388397 -1.822243  1.927027  0.890738  0.577283 -0.302798
 2 -0.016216 -1.101383  0.128118 -0.138639  1.642480 -1.382323 -0.835393
 3  1.411169 -0.395379 -0.412377  0.661016 -0.602245  0.558017  0.588833
 4  0.609378  0.338787 -0.858829  0.006657  1.509428 -0.283262 -0.563293
 5 -1.316789  0.152338 -1.027535  0.026238 -0.052540  1.233837 -1.028193
 6  0.992425  1.364755 -1.384109 -1.888707 -0.259932 -0.207928  0.135734,
           0         1         2
 0 -0.297591  1.019611  0.892070
 1  0.881763 -0.498356  1.708343
 2  0.123616  0.875709  0.387768)


df2.reindex_like(df1, method="ffill", limit=1)

          0         1         2         3   4   5   6
0 -0.297591  1.019611  0.892070  0.892070 NaN NaN NaN
1  0.881763 -0.498356  1.708343  1.708343 NaN NaN NaN
2  0.123616  0.875709  0.387768  0.387768 NaN NaN NaN
3  0.123616  0.875709  0.387768  0.387768 NaN NaN NaN
4       NaN       NaN       NaN       NaN NaN NaN NaN
5       NaN       NaN       NaN       NaN NaN NaN NaN
6       NaN       NaN       NaN       NaN NaN NaN NaN

df2.reindex_like(df1, method="ffill", limit=2)  #可以发现,limit限制了填补的最大数量

          0         1         2         3         4   5   6
0 -0.297591  1.019611  0.892070  0.892070  0.892070 NaN NaN
1  0.881763 -0.498356  1.708343  1.708343  1.708343 NaN NaN
2  0.123616  0.875709  0.387768  0.387768  0.387768 NaN NaN
3  0.123616  0.875709  0.387768  0.387768  0.387768 NaN NaN
4  0.123616  0.875709  0.387768  0.387768  0.387768 NaN NaN
5       NaN       NaN       NaN       NaN       NaN NaN NaN
6       NaN       NaN       NaN       NaN       NaN NaN NaN

df2

          0         1         2
0 -0.297591  1.019611  0.892070
1  0.881763 -0.498356  1.708343
2  0.123616  0.875709  0.387768

df2.rename(columns={0:'A', 1:'B', 2:'C', 3:'D'},
          index={0:'a', 1:'b', 2:'c', 3:'d'})

          A         B         C
a -0.297591  1.019611  0.892070
b  0.881763 -0.498356  1.708343
c  0.123616  0.875709  0.387768

df2  # note that rename returned a copy; df2 itself is unchanged

          0         1         2
0 -0.297591  1.019611  0.892070
1  0.881763 -0.498356  1.708343
2  0.123616  0.875709  0.387768
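
To change the frame itself rather than get a copy, rename also accepts inplace=True; a minimal sketch:

df2.rename(columns={0: 'A', 1: 'B', 2: 'C'}, inplace=True)  # df2's columns are now A, B, C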

N=20
df = pd.DataFrame({
   'A': pd.date_range(start='2016-01-01',periods=N,freq='D'),
   'x': np.linspace(0,stop=N-1,num=N),
   'y': np.random.rand(N),
   'C': np.random.choice(['Low','Medium','High'],N).tolist(),
   'D': np.random.normal(100, 10, size=(N)).tolist()
   })
df

            A     x         y       C           D
0  2016-01-01   0.0  0.529153     Low  110.430517
1  2016-01-02   1.0  0.713513     Low  125.401221
2  2016-01-03   2.0  0.751809  Medium  112.446846
3  2016-01-04   3.0  0.124047    High  108.633343
4  2016-01-05   4.0  0.472205  Medium  102.750572
5  2016-01-06   5.0  0.221076    High  108.208930
6  2016-01-07   6.0  0.231904    High  104.982321
7  2016-01-08   7.0  0.567697  Medium  117.178737
8  2016-01-09   8.0  0.384391  Medium   94.160408
9  2016-01-10   9.0  0.109675  Medium  108.560830
10 2016-01-11  10.0  0.681480    High  101.400936
11 2016-01-12  11.0  0.918687  Medium  102.421124
12 2016-01-13  12.0  0.332227    High   99.464727
13 2016-01-14  13.0  0.373779    High  107.219963
14 2016-01-15  14.0  0.412173     Low   97.184597
15 2016-01-16  15.0  0.194842  Medium   96.671218
16 2016-01-17  16.0  0.372288     Low  105.270272
17 2016-01-18  17.0  0.068876     Low  101.112631
18 2016-01-19  18.0  0.391142    High  102.240937
19 2016-01-20  19.0  0.942600     Low   92.492350

for col in df:
    print(col)


A
x
y
C
D

iteritems() (key, value)

for key, value in df.iteritems():
    print(key)
    print("*******")
    print(value)
    print("#######")


A
*******
0    2016-01-01
1    2016-01-02
2    2016-01-03
3    2016-01-04
4    2016-01-05
5    2016-01-06
6    2016-01-07
7    2016-01-08
8    2016-01-09
9    2016-01-10
10   2016-01-11
11   2016-01-12
12   2016-01-13
13   2016-01-14
14   2016-01-15
15   2016-01-16
16   2016-01-17
17   2016-01-18
18   2016-01-19
19   2016-01-20
Name: A, dtype: datetime64[ns]
#######
x
*******
0      0.0
1      1.0
2      2.0
3      3.0
4      4.0
5      5.0
6      6.0
7      7.0
8      8.0
9      9.0
10    10.0
11    11.0
12    12.0
13    13.0
14    14.0
15    15.0
16    16.0
17    17.0
18    18.0
19    19.0
Name: x, dtype: float64
#######
y
*******
0     0.529153
1     0.713513
2     0.751809
3     0.124047
4     0.472205
5     0.221076
6     0.231904
7     0.567697
8     0.384391
9     0.109675
10    0.681480
11    0.918687
12    0.332227
13    0.373779
14    0.412173
15    0.194842
16    0.372288
17    0.068876
18    0.391142
19    0.942600
Name: y, dtype: float64
#######
C
*******
0        Low
1        Low
2     Medium
3       High
4     Medium
5       High
6       High
7     Medium
8     Medium
9     Medium
10      High
11    Medium
12      High
13      High
14       Low
15    Medium
16       Low
17       Low
18      High
19       Low
Name: C, dtype: object
#######
D
*******
0     110.430517
1     125.401221
2     112.446846
3     108.633343
4     102.750572
5     108.208930
6     104.982321
7     117.178737
8      94.160408
9     108.560830
10    101.400936
11    102.421124
12     99.464727
13    107.219963
14     97.184597
15     96.671218
16    105.270272
17    101.112631
18    102.240937
19     92.492350
Name: D, dtype: float64
#######

iterrows() (index, series)

for index, row in df.iterrows():
    print(index)
    print("********")
    print(row)
    print("########")


0
********
A    2016-01-01 00:00:00
x                      0
y               0.529153
C                    Low
D                110.431
Name: 0, dtype: object
########
1
********
A    2016-01-02 00:00:00
x                      1
y               0.713513
C                    Low
D                125.401
Name: 1, dtype: object
########
2
********
A    2016-01-03 00:00:00
x                      2
y               0.751809
C                 Medium
D                112.447
Name: 2, dtype: object
########
3
********
A    2016-01-04 00:00:00
x                      3
y               0.124047
C                   High
D                108.633
Name: 3, dtype: object
########
4
********
A    2016-01-05 00:00:00
x                      4
y               0.472205
C                 Medium
D                102.751
Name: 4, dtype: object
########
5
********
A    2016-01-06 00:00:00
x                      5
y               0.221076
C                   High
D                108.209
Name: 5, dtype: object
########
6
********
A    2016-01-07 00:00:00
x                      6
y               0.231904
C                   High
D                104.982
Name: 6, dtype: object
########
7
********
A    2016-01-08 00:00:00
x                      7
y               0.567697
C                 Medium
D                117.179
Name: 7, dtype: object
########
8
********
A    2016-01-09 00:00:00
x                      8
y               0.384391
C                 Medium
D                94.1604
Name: 8, dtype: object
########
9
********
A    2016-01-10 00:00:00
x                      9
y               0.109675
C                 Medium
D                108.561
Name: 9, dtype: object
########
10
********
A    2016-01-11 00:00:00
x                     10
y                0.68148
C                   High
D                101.401
Name: 10, dtype: object
########
11
********
A    2016-01-12 00:00:00
x                     11
y               0.918687
C                 Medium
D                102.421
Name: 11, dtype: object
########
12
********
A    2016-01-13 00:00:00
x                     12
y               0.332227
C                   High
D                99.4647
Name: 12, dtype: object
########
13
********
A    2016-01-14 00:00:00
x                     13
y               0.373779
C                   High
D                 107.22
Name: 13, dtype: object
########
14
********
A    2016-01-15 00:00:00
x                     14
y               0.412173
C                    Low
D                97.1846
Name: 14, dtype: object
########
15
********
A    2016-01-16 00:00:00
x                     15
y               0.194842
C                 Medium
D                96.6712
Name: 15, dtype: object
########
16
********
A    2016-01-17 00:00:00
x                     16
y               0.372288
C                    Low
D                 105.27
Name: 16, dtype: object
########
17
********
A    2016-01-18 00:00:00
x                     17
y              0.0688757
C                    Low
D                101.113
Name: 17, dtype: object
########
18
********
A    2016-01-19 00:00:00
x                     18
y               0.391142
C                   High
D                102.241
Name: 18, dtype: object
########
19
********
A    2016-01-20 00:00:00
x                     19
y                 0.9426
C                    Low
D                92.4924
Name: 19, dtype: object
########

itertuples()

for row in df.itertuples():
    print(row)
    print("*********")


Pandas(Index=0, A=Timestamp('2016-01-01 00:00:00'), x=0.0, y=0.5291527485322772, C='Low', D=110.43051702923863)
*********
Pandas(Index=1, A=Timestamp('2016-01-02 00:00:00'), x=1.0, y=0.713512538332376, C='Low', D=125.40122093094763)
*********
Pandas(Index=2, A=Timestamp('2016-01-03 00:00:00'), x=2.0, y=0.7518093449140011, C='Medium', D=112.44684623090683)
*********
Pandas(Index=3, A=Timestamp('2016-01-04 00:00:00'), x=3.0, y=0.12404682661025335, C='High', D=108.63334270085768)
*********
Pandas(Index=4, A=Timestamp('2016-01-05 00:00:00'), x=4.0, y=0.47220500135853094, C='Medium', D=102.75057211144569)
*********
Pandas(Index=5, A=Timestamp('2016-01-06 00:00:00'), x=5.0, y=0.22107632396965704, C='High', D=108.20892974035311)
*********
Pandas(Index=6, A=Timestamp('2016-01-07 00:00:00'), x=6.0, y=0.23190410081052582, C='High', D=104.98232144314449)
*********
Pandas(Index=7, A=Timestamp('2016-01-08 00:00:00'), x=7.0, y=0.5676969704991909, C='Medium', D=117.17873695254926)
*********
Pandas(Index=8, A=Timestamp('2016-01-09 00:00:00'), x=8.0, y=0.38439055971010483, C='Medium', D=94.16040790153708)
*********
Pandas(Index=9, A=Timestamp('2016-01-10 00:00:00'), x=9.0, y=0.10967465769586215, C='Medium', D=108.56083032097501)
*********
Pandas(Index=10, A=Timestamp('2016-01-11 00:00:00'), x=10.0, y=0.6814801929159177, C='High', D=101.40093570017285)
*********
Pandas(Index=11, A=Timestamp('2016-01-12 00:00:00'), x=11.0, y=0.9186874162117078, C='Medium', D=102.42112353899493)
*********
Pandas(Index=12, A=Timestamp('2016-01-13 00:00:00'), x=12.0, y=0.33222699128916544, C='High', D=99.46472715055548)
*********
Pandas(Index=13, A=Timestamp('2016-01-14 00:00:00'), x=13.0, y=0.37377940932622644, C='High', D=107.21996306704972)
*********
Pandas(Index=14, A=Timestamp('2016-01-15 00:00:00'), x=14.0, y=0.41217288447139533, C='Low', D=97.1845970026168)
*********
Pandas(Index=15, A=Timestamp('2016-01-16 00:00:00'), x=15.0, y=0.19484179666549728, C='Medium', D=96.67121785562782)
*********
Pandas(Index=16, A=Timestamp('2016-01-17 00:00:00'), x=16.0, y=0.3722882537710307, C='Low', D=105.27027217632694)
*********
Pandas(Index=17, A=Timestamp('2016-01-18 00:00:00'), x=17.0, y=0.068875657049556, C='Low', D=101.11263086450178)
*********
Pandas(Index=18, A=Timestamp('2016-01-19 00:00:00'), x=18.0, y=0.3911420688006072, C='High', D=102.24093699498466)
*********
Pandas(Index=19, A=Timestamp('2016-01-20 00:00:00'), x=19.0, y=0.9425996619637542, C='Low', D=92.49235045195462)
*********

The tutorial notes that what these iterators yield is a copy, so modifying the yielded elements does not change the original data.
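
A small sketch of that point (the values here are purely illustrative):

tmp = pd.DataFrame({'a': [1, 2, 3]})
for index, row in tmp.iterrows():
    row['a'] = 0        # modifies the yielded copy only
tmp['a'].tolist()       # still [1, 2, 3]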

sort_index

unsorted_df=pd.DataFrame(np.random.randn(10,2),index=[1,4,6,2,3,5,9,8,0,7],columns=['col2','col1'])
unsorted_df

       col2      col1
1  0.418578 -0.556598
4  0.513646  1.436592
6  0.830816  1.500456
2 -0.373790 -0.578432
3  0.961146  0.991754
5  0.826093 -0.345533
9  0.881435  0.934766
8 -1.388952 -1.276708
0 -0.685924 -0.210499
7  1.556807 -0.652186

unsorted_df.sort_index(ascending=False)  # sort by index labels

       col2      col1
9  0.881435  0.934766
8 -1.388952 -1.276708
7  1.556807 -0.652186
6  0.830816  1.500456
5  0.826093 -0.345533
4  0.513646  1.436592
3  0.961146  0.991754
2 -0.373790 -0.578432
1  0.418578 -0.556598
0 -0.685924 -0.210499

help(unsorted_df.sort_index)


Help on method sort_index in module pandas.core.frame:

sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None) method of pandas.core.frame.DataFrame instance
    Sort object by labels (along an axis)

    Parameters
    ----------
    axis : index, columns to direct sorting
    level : int or level name or list of ints or list of level names
        if not None, sort on values in specified index level(s)
    ascending : boolean, default True
        Sort ascending vs. descending
    inplace : bool, default False
        if True, perform operation in-place
    kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
         Choice of sorting algorithm. See also ndarray.np.sort for more
         information.  `mergesort` is the only stable algorithm. For
         DataFrames, this option is only applied when sorting on a single
         column or label.
    na_position : {'first', 'last'}, default 'last'
         `first` puts NaNs at the beginning, `last` puts NaNs at the end.
         Not implemented for MultiIndex.
    sort_remaining : bool, default True
        if true and sorting by level and index is multilevel, sort by other
        levels too (in order) after sorting by specified level

    Returns
    -------
    sorted_obj : DataFrame


unsorted_df.sort_index(axis=1)

       col1      col2
1 -0.556598  0.418578
4  1.436592  0.513646
6  1.500456  0.830816
2 -0.578432 -0.373790
3  0.991754  0.961146
5 -0.345533  0.826093
9  0.934766  0.881435
8 -1.276708 -1.388952
0 -0.210499 -0.685924
7 -0.652186  1.556807

sort_values

unsorted_df = pd.DataFrame({'col1':[2,1,1,1],'col2':[1,3,2,4]})
unsorted_df

   col1  col2
0     2     1
1     1     3
2     1     2
3     1     4

unsorted_df.sort_values(by="col1")

   col1  col2
1     1     3
2     1     2
3     1     4
0     2     1

help(unsorted_df.sort_values)


Help on method sort_values in module pandas.core.frame:

sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last') method of pandas.core.frame.DataFrame instance
    Sort by the values along either axis

    Parameters
    ----------
    by : str or list of str
        Name or list of names to sort by.

        - if `axis` is 0 or `'index'` then `by` may contain index
          levels and/or column labels
        - if `axis` is 1 or `'columns'` then `by` may contain column
          levels and/or index labels

        .. versionchanged:: 0.23.0
           Allow specifying index or column level names.
    axis : {0 or 'index', 1 or 'columns'}, default 0
         Axis to be sorted
    ascending : bool or list of bool, default True
         Sort ascending vs. descending. Specify list for multiple sort
         orders.  If this is a list of bools, must match the length of
         the by.
    inplace : bool, default False
         if True, perform operation in-place
    kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
         Choice of sorting algorithm. See also ndarray.np.sort for more
         information.  `mergesort` is the only stable algorithm. For
         DataFrames, this option is only applied when sorting on a single
         column or label.
    na_position : {'first', 'last'}, default 'last'
         `first` puts NaNs at the beginning, `last` puts NaNs at the end

    Returns
    -------
    sorted_obj : DataFrame

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'col1' : ['A', 'A', 'B', np.nan, 'D', 'C'],
    ...     'col2' : [2, 1, 9, 8, 7, 4],
    ...     'col3': [0, 1, 9, 4, 2, 3],
    ... })
    >>> df
        col1 col2 col3
    0   A    2    0
    1   A    1    1
    2   B    9    9
    3   NaN  8    4
    4   D    7    2
    5   C    4    3

    Sort by col1

    >>> df.sort_values(by=['col1'])
        col1 col2 col3
    0   A    2    0
    1   A    1    1
    2   B    9    9
    5   C    4    3
    4   D    7    2
    3   NaN  8    4

    Sort by multiple columns

    >>> df.sort_values(by=['col1', 'col2'])
        col1 col2 col3
    1   A    1    1
    0   A    2    0
    2   B    9    9
    5   C    4    3
    4   D    7    2
    3   NaN  8    4

    Sort Descending

    >>> df.sort_values(by='col1', ascending=False)
        col1 col2 col3
    4   D    7    2
    5   C    4    3
    2   B    9    9
    0   A    2    0
    1   A    1    1
    3   NaN  8    4

    Putting NAs first

    >>> df.sort_values(by='col1', ascending=False, na_position='first')
        col1 col2 col3
    3   NaN  8    4
    4   D    7    2
    5   C    4    3
    2   B    9    9
    0   A    2    0
    1   A    1    1


df = pd.DataFrame(np.random.randn(8, 4),
index = ['a','b','c','d','e','f','g','h'], columns = ['A', 'B', 'C', 'D'])
df

          A         B         C         D
a -0.235229  0.214813  0.838116  1.081632
b  0.530365  0.600021  0.753903 -1.958886
c  1.760542 -1.027882 -0.053263  0.299710
d -0.241942  0.455707 -0.684968 -0.513217
e  0.866758  1.035051 -0.451651 -0.987964
f  1.620520  0.236408  0.478373 -1.012238
g -0.236978  0.352751 -0.514737 -0.195936
h -0.046064  0.129530 -0.874676  1.740141

df.loc[:, 'A']


a    2.539530
b   -0.278140
c    1.291831
d   -0.231592
e   -2.047005
f   -0.720743
g   -0.995131
h    0.190029
Name: A, dtype: float64


df.loc[:, ['A', 'C']]

          A         C
a  2.539530 -0.290170
b -0.278140  1.575699
c  1.291831  0.038547
d -0.231592  0.117562
e -2.047005 -0.569768
f -0.720743  0.321223
g -0.995131  1.530757
h  0.190029 -0.068202

df.loc['a':'h']

          A         B         C         D
a  2.539530  0.046380 -0.290170 -1.540302
b -0.278140  1.420046  1.575699  0.533353
c  1.291831  2.595299  0.038547 -0.488134
d -0.231592 -0.162497  0.117562  1.452291
e -2.047005 -0.046110 -0.569768  1.328672
f -0.720743  0.339251  0.321223 -0.310041
g -0.995131  0.831769  1.530757  0.975214
h  0.190029  1.056606 -0.068202 -1.127776

df.loc[df.loc[:, 'A'] > 0, 'A']


b    0.530365
c    1.760542
e    0.866758
f    1.620520
Name: A, dtype: float64

iloc()

df.iloc[:4]

          A         B         C         D
a  2.539530  0.046380 -0.290170 -1.540302
b -0.278140  1.420046  1.575699  0.533353
c  1.291831  2.595299  0.038547 -0.488134
d -0.231592 -0.162497  0.117562  1.452291

df.iloc[1:5, 2:4]

          C         D
b  1.575699  0.533353
c  0.038547 -0.488134
d  0.117562  1.452291
e -0.569768  1.328672

import collections
p = collections.defaultdict(int)
p['A'] += 1
p['B'] += 1
p


defaultdict(int, {'A': 1, 'B': 1})


df = pd.DataFrame({'thing':['A', 'A', 'B', 'A', 'B', 'A', 'C', 'C', 'C']})
for row in df.loc[df.loc[:, 'thing']== 'D'].iterrows():
    print(1)


x = set([1, 2, 3, 1])
x


{1, 2, 3}


df2 = df.copy()
df2

          A         D
a -0.235229  1.081632
b  0.530365 -1.958886
c  1.760542  0.299710
d -0.241942 -0.513217
e  0.866758 -0.987964
f  1.620520 -1.012238
g -0.236978 -0.195936
h -0.046064  1.740141

df2.loc['a'][0] = 3


df3 = df['A']
df3


a   -0.235229
b    0.530365
c    1.760542
d   -0.241942
e    0.866758
f    1.620520
g   -0.236978
h   -0.046064
Name: A, dtype: float64


df3['a'] = 3
df3


a    3.000000
b    0.530365
c    1.760542
d   -0.241942
e    0.866758
f    1.620520
g   -0.236978
h   -0.046064
Name: A, dtype: float64


df3 = df.iloc[:4]
df3

          A         D
a  3.000000  1.081632
b  0.530365 -1.958886
c  1.760542  0.299710
d -0.241942 -0.513217
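
Notice that the 3 assigned through df3 shows up in df as well, because here df['A'] behaved as a view rather than a copy. When an independent object is wanted, an explicit copy avoids this; a minimal sketch:

df4 = df['A'].copy()   # an independent copy of the column
df4['b'] = 0           # does not touch df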

df3.pop('A')


a    0.000000
b    0.530365
c    1.760542
d   -0.241942
Name: A, dtype: float64


d = {1:1, 2:2}
u = {3:3}
d.update(u)
d


{1: 1, 2: 2, 3: 3}


s = pd.Series(['Tom', 'William Rick', 'John', 'Alber@t', np.nan, '1234','SteveSmith'])
s


0             Tom
1    William Rick
2            John
3         Alber@t
4             NaN
5            1234
6      SteveSmith
dtype: object

s.str.lower() s.str.upper()

list(s.str)


[0      T
 1      W
 2      J
 3      A
 4    NaN
 5      1
 6      S
 dtype: object, 0      o
 1      i
 2      o
 3      l
 4    NaN
 5      2
 6      t
 dtype: object, 0      m
 1      l
 2      h
 3      b
 4    NaN
 5      3
 6      e
 dtype: object, 0    NaN
 1      l
 2      n
 3      e
 4    NaN
 5      4
 6      v
 dtype: object, 0    NaN
 1      i
 2    NaN
 3      r
 4    NaN
 5    NaN
 6      e
 dtype: object, 0    NaN
 1      a
 2    NaN
 3      @
 4    NaN
 5    NaN
 6      S
 dtype: object, 0    NaN
 1      m
 2    NaN
 3      t
 4    NaN
 5    NaN
 6      m
 dtype: object, 0    NaN
 1
 2    NaN
 3    NaN
 4    NaN
 5    NaN
 6      i
 dtype: object, 0    NaN
 1      R
 2    NaN
 3    NaN
 4    NaN
 5    NaN
 6      t
 dtype: object, 0    NaN
 1      i
 2    NaN
 3    NaN
 4    NaN
 5    NaN
 6      h
 dtype: object, 0    NaN
 1      c
 2    NaN
 3    NaN
 4    NaN
 5    NaN
 6    NaN
 dtype: object, 0    NaN
 1      k
 2    NaN
 3    NaN
 4    NaN
 5    NaN
 6    NaN
 dtype: object]


s.str.lower()  # so this lowercases every element of the Series


0             tom
1    william rick
2            john
3         alber@t
4             NaN
5            1234
6      stevesmith
dtype: object


s.str.upper()


0             TOM
1    WILLIAM RICK
2            JOHN
3         ALBER@T
4             NaN
5            1234
6      STEVESMITH
dtype: object

s.str.len()

s.str.len()  # returns the length of each element


0     3.0
1    12.0
2     4.0
3     7.0
4     NaN
5     4.0
6    10.0
dtype: float64

s.str.strip()

s = pd.Series(['Tom            ', ' William Rick', 'John', 'Alber@t'])
s


0    Tom
1       William Rick
2               John
3            Alber@t
dtype: object


s.str.strip()


0             Tom
1    William Rick
2            John
3         Alber@t
dtype: object


s.str.strip('k')


0    Tom
1        William Ric
2               John
3            Alber@t
dtype: object

s.str.split()

s.str.split('o')


0    [T, m            ]
1       [ William Rick]
2               [J, hn]
3             [Alber@t]
dtype: object


s.str.split()


0              [Tom]
1    [William, Rick]
2             [John]
3          [Alber@t]
dtype: object


s.str.split(' ')


0    [Tom, , , , , , , , , , , , ]
1                [, William, Rick]
2                           [John]
3                        [Alber@t]
dtype: object

s.str.cat()

s.str.cat()


'Tom             William RickJohnAlber@t'


s.str.cat(sep='A')


'Tom            A William RickAJohnAAlber@t'


s.str.cat(sep='_____')


'Tom            _____ William Rick_____John_____Alber@t'


help(s.str.cat)  # when not concatenating with another Series, the separator is passed via the sep keyword


Help on method cat in module pandas.core.strings:

cat(others=None, sep=None, na_rep=None, join=None) method of pandas.core.strings.StringMethods instance
    Concatenate strings in the Series/Index with given separator.

    If `others` is specified, this function concatenates the Series/Index
    and elements of `others` element-wise.
    If `others` is not passed, then all values in the Series/Index are
    concatenated into a single string with a given `sep`.

    Parameters
    ----------
    others : Series, Index, DataFrame, np.ndarrary or list-like
        Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
        other list-likes of strings must have the same length as the
        calling Series/Index, with the exception of indexed objects (i.e.
        Series/Index/DataFrame) if `join` is not None.

        If others is a list-like that contains a combination of Series,
        np.ndarray (1-dim) or list-like, then all elements will be unpacked
        and must satisfy the above criteria individually.

        If others is None, the method returns the concatenation of all
        strings in the calling Series/Index.
    sep : string or None, default None
        If None, concatenates without any separator.
    na_rep : string or None, default None
        Representation that is inserted for all missing values:

        - If `na_rep` is None, and `others` is None, missing values in the
          Series/Index are omitted from the result.
        - If `na_rep` is None, and `others` is not None, a row containing a
          missing value in any of the columns (before concatenation) will
          have a missing value in the result.
    join : {'left', 'right', 'outer', 'inner'}, default None
        Determines the join-style between the calling Series/Index and any
        Series/Index/DataFrame in `others` (objects without an index need
        to match the length of the calling Series/Index). If None,
        alignment is disabled, but this option will be removed in a future
        version of pandas and replaced with a default of `'left'`. To
        disable alignment, use `.values` on any Series/Index/DataFrame in
        `others`.

        .. versionadded:: 0.23.0

    Returns
    -------
    concat : str or Series/Index of objects
        If `others` is None, `str` is returned, otherwise a `Series/Index`
        (same type as caller) of objects is returned.

    See Also
    --------
    split : Split each string in the Series/Index

    Examples
    --------
    When not passing `others`, all values are concatenated into a single
    string:

    >>> s = pd.Series(['a', 'b', np.nan, 'd'])
    >>> s.str.cat(sep=' ')
    'a b d'

    By default, NA values in the Series are ignored. Using `na_rep`, they
    can be given a representation:

    >>> s.str.cat(sep=' ', na_rep='?')
    'a b ? d'

    If `others` is specified, corresponding values are concatenated with
    the separator. Result will be a Series of strings.

    >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
    0    a,A
    1    b,B
    2    NaN
    3    d,D
    dtype: object

    Missing values will remain missing in the result, but can again be
    represented using `na_rep`

    >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
    0    a,A
    1    b,B
    2    -,C
    3    d,D
    dtype: object

    If `sep` is not specified, the values are concatenated without
    separation.

    >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
    0    aA
    1    bB
    2    -C
    3    dD
    dtype: object

    Series with different indexes can be aligned before concatenation. The
    `join`-keyword works as in other methods.

    >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
    >>> s.str.cat(t, join=None, na_rep='-')
    0    ad
    1    ba
    2    -e
    3    dc
    dtype: object
    >>>
    >>> s.str.cat(t, join='left', na_rep='-')
    0    aa
    1    b-
    2    -c
    3    dd
    dtype: object
    >>>
    >>> s.str.cat(t, join='outer', na_rep='-')
    0    aa
    1    b-
    2    -c
    3    dd
    4    -e
    dtype: object
    >>>
    >>> s.str.cat(t, join='inner', na_rep='-')
    0    aa
    2    -c
    3    dd
    dtype: object
    >>>
    >>> s.str.cat(t, join='right', na_rep='-')
    3    dd
    0    aa
    4    -e
    2    -c
    dtype: object

    For more examples, see :ref:`here <text.concatenate>`.

s.str.get_dummies()

s.str.get_dummies()

    William Rick  Alber@t  John  Tom
0              0        0     0    1
1              1        0     0    0
2              0        0     1    0
3              0        1     0    0

s = pd.Series(['Tom ', ' William Rick', 'Alber@t', 'John'])
s.str.get_dummies()

    William Rick  Alber@t  John  Tom
0              0        0     0    1
1              1        0     0    0
2              0        1     0    0
3              0        0     1    0

So the result is a DataFrame of indicator (dummy) columns, one per distinct value.

s.str.contains()

s.str.contains(' ')


0     True
1     True
2    False
3    False
dtype: bool


s.str.contains('o')


0     True
1    False
2    False
3     True
dtype: bool

s.str.replace()

s.str.replace('@', '$')


0             Tom
1     William Rick
2          Alber$t
3             John
dtype: object

s.str.repeat()

s = pd.Series(['Tom ', ' William Rick', 'Alber@t', 'John', np.nan])


s.str.repeat(3)


0                               Tom Tom Tom
1     William Rick William Rick William Rick
2                      Alber@tAlber@tAlber@t
3                               JohnJohnJohn
dtype: object


s.str.repeat([1, 2, 3, 4])


0                          Tom
1     William Rick William Rick
2         Alber@tAlber@tAlber@t
3              JohnJohnJohnJohn
dtype: object

s.str.count(s2)

Counts how many times the pattern s2 occurs inside each element (it need not match the whole string; substrings count).

s.str.count('m')


0    1
1    1
2    0
3    0
dtype: int64

s.str.startswith()

s.str.startswith('To')


0     True
1    False
2    False
3    False
4      NaN
dtype: object


s.str.startswith('To', na=False)


0     True
1    False
2    False
3    False
4    False
dtype: bool

s.str.find()

s.str.find('e')  # -1 means the substring is not present; any other number is the position of the first match


0   -1.0
1   -1.0
2    3.0
3   -1.0
4    NaN
dtype: float64

s.str.findall()

s.str.findall('e')  # returns every match as a list; arguably less handy than find() above


0     []
1     []
2    [e]
3     []
4    NaN
dtype: object

s.str.islower()

s.str.islower()


0    False
1    False
2    False
3    False
4      NaN
dtype: object

s.str.isupper()

s.str.isupper()


0    False
1    False
2    False
3    False
4      NaN
dtype: object

s.str.isnumeric()

s.str.isnumeric()


0    False
1    False
2    False
3    False
4      NaN
dtype: object

get_option()

set_option()

reset_option()

describe_option()

option_context()

display.max_rows

display.max_columns

display.expand_frame_repr

display.max_colwidth

display.precision  # display precision

get_option()

pd.get_option("display.max_rows")  #能够显示的最大行数


60


pd.get_option("display.max_columns") #能够显示的最大列数


20


pd.set_option("display.max_rows", 10)  #设置能够显示的最大行数为10
pd.get_option("display.max_rows")


10


s = pd.Series(np.arange(11))
s


0      0
1      1
2      2
3      3
4      4
      ..
6      6
7      7
8      8
9      9
10    10
Length: 11, dtype: int32


pd.set_option("display.max_columns", 2) #设置能够显示最大列数为2
pd.get_option("display.max_columns")


2


data = {1:np.arange(10), 2:np.arange(1, 11), 3:np.arange(2, 12)}
df = pd.DataFrame(data)
df

   1  ...   3
0  0  ...   2
1  1  ...   3
2  2  ...   4
3  3  ...   5
4  4  ...   6
5  5  ...   7
6  6  ...   8
7  7  ...   9
8  8  ...  10
9  9  ...  11

10 rows × 3 columns

reset_option()

reset_option takes one argument and resets that option to its default value.

pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")


s


0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
dtype: int32

describe_option()

Prints the documentation for an option.

pd.describe_option("display.max_rows")


display.max_rows : int
    If max_rows is exceeded, switch to truncate view. Depending on
    `large_repr`, objects are either centrally truncated or printed as
    a summary view. 'None' value means unlimited.

    In case python/IPython is running in a terminal and `large_repr`
    equals 'truncate' this can be set to 0 and pandas will auto-detect
    the height of the terminal and print a truncated object which fits
    the screen height. The IPython notebook, IPython qtconsole, or
    IDLE do not run in a terminal and hence it is not possible to do
    correct auto-detection.
    [default: 60] [currently: 60]

option_context()

with pd.option_context("display.max_rows", 10):
    print(pd.get_option("display.max_rows"))
print(pd.get_option("display.max_rows"))


10
60

So this is simply a context manager: inside the with block the option is temporarily changed, and on exit it reverts to its previous value.

pct_change

Computes the rate of change of each element relative to the preceding element.

s = pd.Series([1,2,3,4,5,4])
s.pct_change()


0         NaN
1    1.000000
2    0.500000
3    0.333333
4    0.250000
5   -0.200000
dtype: float64


df = pd.DataFrame(np.random.randn(5, 2))
df

          0         1
0  0.855450 -0.673131
1  0.610321  0.389186
2  0.386450 -0.209481
3 -0.159426  1.941561
4  0.692407  0.332914

df.pct_change()

          0          1
0       NaN        NaN
1 -0.286549  -1.578173
2 -0.366809  -1.538254
3 -1.412541 -10.268438
4 -5.343109  -0.828533

help(df.pct_change)


Help on method pct_change in module pandas.core.generic:

pct_change(periods=1, fill_method='pad', limit=None, freq=None, **kwargs) method of pandas.core.frame.DataFrame instance
    Percentage change between the current and a prior element.

    Computes the percentage change from the immediately previous row by
    default. This is useful in comparing the percentage of change in a time
    series of elements.

    Parameters
    ----------
    periods : int, default 1
        Periods to shift for forming percent change.
    fill_method : str, default 'pad'
        How to handle NAs before computing percent changes.
    limit : int, default None
        The number of consecutive NAs to fill before stopping.
    freq : DateOffset, timedelta, or offset alias string, optional
        Increment to use from time series API (e.g. 'M' or BDay()).
    **kwargs
        Additional keyword arguments are passed into
        `DataFrame.shift` or `Series.shift`.

    Returns
    -------
    chg : Series or DataFrame
        The same type as the calling object.

    See Also
    --------
    Series.diff : Compute the difference of two elements in a Series.
    DataFrame.diff : Compute the difference of two elements in a DataFrame.
    Series.shift : Shift the index by some number of periods.
    DataFrame.shift : Shift the index by some number of periods.

    Examples
    --------
    **Series**

    >>> s = pd.Series([90, 91, 85])
    >>> s
    0    90
    1    91
    2    85
    dtype: int64

    >>> s.pct_change()
    0         NaN
    1    0.011111
    2   -0.065934
    dtype: float64

    >>> s.pct_change(periods=2)
    0         NaN
    1         NaN
    2   -0.055556
    dtype: float64

    See the percentage change in a Series where filling NAs with last
    valid observation forward to next valid.

    >>> s = pd.Series([90, 91, None, 85])
    >>> s
    0    90.0
    1    91.0
    2     NaN
    3    85.0
    dtype: float64

    >>> s.pct_change(fill_method='ffill')
    0         NaN
    1    0.011111
    2    0.000000
    3   -0.065934
    dtype: float64

    **DataFrame**

    Percentage change in French franc, Deutsche Mark, and Italian lira from
    1980-01-01 to 1980-03-01.

    >>> df = pd.DataFrame({
    ...     'FR': [4.0405, 4.0963, 4.3149],
    ...     'GR': [1.7246, 1.7482, 1.8519],
    ...     'IT': [804.74, 810.01, 860.13]},
    ...     index=['1980-01-01', '1980-02-01', '1980-03-01'])
    >>> df
                    FR      GR      IT
    1980-01-01  4.0405  1.7246  804.74
    1980-02-01  4.0963  1.7482  810.01
    1980-03-01  4.3149  1.8519  860.13

    >>> df.pct_change()
                      FR        GR        IT
    1980-01-01       NaN       NaN       NaN
    1980-02-01  0.013810  0.013684  0.006549
    1980-03-01  0.053365  0.059318  0.061876

    Percentage of change in GOOG and APPL stock volume. Shows computing
    the percentage change between columns.

    >>> df = pd.DataFrame({
    ...     '2016': [1769950, 30586265],
    ...     '2015': [1500923, 40912316],
    ...     '2014': [1371819, 41403351]},
    ...     index=['GOOG', 'APPL'])
    >>> df
              2016      2015      2014
    GOOG   1769950   1500923   1371819
    APPL  30586265  40912316  41403351

    >>> df.pct_change(axis='columns')
          2016      2015      2014
    GOOG   NaN -0.151997 -0.086016
    APPL   NaN  0.337604  0.012002


s.pct_change(2)


0         NaN
1         NaN
2    2.000000
3    1.000000
4    0.666667
5    0.000000
dtype: float64

covariance

Computes the covariance; NA values are automatically dropped.

s1 = pd.Series(np.random.randn(10))
s2 = pd.Series(np.random.randn(10))
s1.cov(s2)


0.05172017259428779
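
As a quick sanity check (a sketch added here, not original output), the value should match the sample-covariance formula computed by hand:

# sample covariance: sum((x - x.mean()) * (y - y.mean())) / (n - 1)
((s1 - s1.mean()) * (s2 - s2.mean())).sum() / (len(s1) - 1)   # should equal s1.cov(s2)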

When cov is applied to a DataFrame, it computes the pairwise covariance between the columns.

frame = pd.DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
frame.cov()

          a         b         c         d         e
a  1.737924  0.114498  0.279058 -0.325639 -0.224395
b  0.114498  0.543846  0.421225 -0.103112 -0.373280
c  0.279058  0.421225  1.326316 -0.635491 -0.573974
d -0.325639 -0.103112 -0.635491  0.978824  0.764530
e -0.224395 -0.373280 -0.573974  0.764530  0.856515

frame['a'].cov(frame.loc[:, 'b'])


0.11449787431144376

correlation

Computes the (Pearson) correlation coefficient.

frame = pd.DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
frame.corr()

          a         b         c         d         e
a  1.000000  0.312138  0.173514  0.247459 -0.389265
b  0.312138  1.000000 -0.410371  0.104755 -0.480116
c  0.173514 -0.410371  1.000000  0.124817 -0.230226
d  0.247459  0.104755  0.124817  1.000000  0.259208
e -0.389265 -0.480116 -0.230226  0.259208  1.000000

frame['a'].corr(frame.iloc[:, 1])


0.3121378592734573
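
A minimal check (a sketch, not original output): the Pearson correlation is just the covariance divided by the two standard deviations, so the value above can be reproduced as:

frame['a'].cov(frame.iloc[:, 1]) / (frame['a'].std() * frame.iloc[:, 1].std())   # ≈ 0.312138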

Ranking

Computes numerical ranks of the values (not a sort).

s = pd.Series(np.random.randn(5), index=list('abcde'))
s['d'] = s['b'] # so there's a tie
s


a   -2.179226
b    0.614786
c    1.801039
d    0.614786
e   -1.604162
dtype: float64


s.rank()


a    1.0
b    3.5
c    5.0
d    3.5
e    2.0
dtype: float64
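
By default ties share the average rank, which is why b and d both get 3.5 above. A short sketch of the other standard tie-handling options on the same Series:

s.rank(method='min')      # tied values both get the smaller rank
s.rank(method='first')    # ties broken by order of appearance
s.rank(ascending=False)   # rank from largest to smallest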

.rolling()

df = pd.DataFrame(np.random.randn(10, 4),
   index = pd.date_range('1/1/2000', periods=10),
   columns = ['A', 'B', 'C', 'D'])
df

                   A         B         C         D
2000-01-01 -0.602802  0.294063 -0.803316 -0.500838
2000-01-02 -0.968835 -1.745470  0.027664 -1.012092
2000-01-03 -0.047073  0.440166 -0.338257  1.551372
2000-01-04  0.136861  0.357544 -0.370691 -0.312876
2000-01-05  1.257872 -1.126768 -0.539122 -0.478309
2000-01-06 -0.954518 -0.067380  0.139257 -0.908213
2000-01-07  1.501658 -1.189674  0.794113 -0.155611
2000-01-08  0.400153  0.291841 -0.450429  1.044665
2000-01-09 -0.797415  0.346594 -0.107653 -0.605027
2000-01-10 -0.532034  1.296260  0.303357 -0.056933

df.rolling(window=3).mean()

                   A         B         C         D
2000-01-01       NaN       NaN       NaN       NaN
2000-01-02       NaN       NaN       NaN       NaN
2000-01-03 -0.539570 -0.337081 -0.371303  0.012814
2000-01-04 -0.293015 -0.315920 -0.227095  0.075468
2000-01-05  0.449220 -0.109686 -0.416023  0.253396
2000-01-06  0.146739 -0.278868 -0.256852 -0.566466
2000-01-07  0.601671 -0.794607  0.131416 -0.514044
2000-01-08  0.315764 -0.321738  0.160980 -0.006386
2000-01-09  0.368132 -0.183746  0.078677  0.094676
2000-01-10 -0.309765  0.644898 -0.084908  0.127569

Note that window=3, so each window covers 3 rows; since the function applied afterwards is mean, row n's result is the average of rows n, n-1 and n-2, and the first two rows are therefore NaN (the quick check below confirms this).
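
A quick verification (a sketch, not original output): the value reported on 2000-01-03 for column A is just the plain mean of its first three rows:

df['A'][:3].mean()   # equals df.rolling(window=3).mean().iloc[2, 0]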

.expanding()

df.expanding(min_periods=3).mean()

                   A         B         C         D
2000-01-01       NaN       NaN       NaN       NaN
2000-01-02       NaN       NaN       NaN       NaN
2000-01-03 -0.539570 -0.337081 -0.371303  0.012814
2000-01-04 -0.370462 -0.163425 -0.371150 -0.068609
2000-01-05 -0.044795 -0.356093 -0.404744 -0.150549
2000-01-06 -0.196416 -0.307974 -0.314078 -0.276826
2000-01-07  0.046166 -0.433932 -0.155765 -0.259510
2000-01-08  0.090415 -0.343210 -0.192598 -0.096488
2000-01-09 -0.008233 -0.266565 -0.183159 -0.152992
2000-01-10 -0.060613 -0.110283 -0.134508 -0.143386

df['A'][:4].mean()


-0.37046210746336367

Note that 2000-01-03 matches the rolling result but the later rows differ: min_periods=3 only requires the window to contain at least 3 rows, so for the fourth row the mean is actually taken over all of the first 4 rows (compare with df['A'][:4].mean() above, and the cumulative-mean sketch below).
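
In other words, expanding() is a cumulative window. As a sketch (not original output), the expanding mean is simply a running mean; min_periods=3 only masks the first two entries with NaN:

df['A'].cumsum() / np.arange(1, len(df) + 1)   # compare with df['A'].expanding().mean()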

.ewm()

Exponentially weighted moving average

How the values are computed (verified by hand below the table):

df

                   A         B         C         D
2000-01-01 -0.602802  0.294063 -0.803316 -0.500838
2000-01-02 -0.968835 -1.745470  0.027664 -1.012092
2000-01-03 -0.047073  0.440166 -0.338257  1.551372
2000-01-04  0.136861  0.357544 -0.370691 -0.312876
2000-01-05  1.257872 -1.126768 -0.539122 -0.478309
2000-01-06 -0.954518 -0.067380  0.139257 -0.908213
2000-01-07  1.501658 -1.189674  0.794113 -0.155611
2000-01-08  0.400153  0.291841 -0.450429  1.044665
2000-01-09 -0.797415  0.346594 -0.107653 -0.605027
2000-01-10 -0.532034  1.296260  0.303357 -0.056933

df.ewm(com=0.5, adjust=True).mean()

                   A         B         C         D
2000-01-01 -0.602802  0.294063 -0.803316 -0.500838
2000-01-02 -0.877327 -1.235587 -0.180081 -0.884278
2000-01-03 -0.302535 -0.075451 -0.289588  0.801941
2000-01-04 -0.005943  0.216821 -0.344332  0.049439
2000-01-05  0.840082 -0.682606 -0.474729 -0.303847
2000-01-06 -0.357961 -0.271892 -0.064843 -0.707311
2000-01-07  0.882352 -0.884027  0.508056 -0.339343
2000-01-08  0.560837 -0.099995 -0.131032  0.583470
2000-01-09 -0.344710  0.197746 -0.115445 -0.208901
2000-01-10 -0.469595  0.930101  0.163761 -0.107587

a = 1 / (1 + 0.5)                  # alpha = 1 / (1 + com), here com=0.5
x1 = df.iloc[0, 0]                 # first value of column A
x2 = df.iloc[1, 0]                 # second value of column A
(x2 + (1 - a) * x1) / (1 + 1 - a)  # adjust=True: weighted sum divided by the sum of weights


-0.8773265770527067
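
The same logic extends to the whole column. A sketch (assuming the documented adjust=True weighting, with alpha = 1/(1+com)) that reproduces column A of the ewm result:

alpha = 1 / (1 + 0.5)
x = df['A'].to_numpy()
out = []
for t in range(len(x)):
    w = (1 - alpha) ** np.arange(t, -1, -1)      # weights (1-alpha)^i, oldest observation first
    out.append((w * x[:t + 1]).sum() / w.sum())  # weighted sum / sum of weights
pd.Series(out, index=df.index)                   # matches df.ewm(com=0.5, adjust=True).mean()['A']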


df.mean()


A   -0.060613
B   -0.110283
C   -0.134508
D   -0.143386
dtype: float64


df = pd.DataFrame(np.random.randn(10, 4),
   index = pd.date_range('1/1/2000', periods=10),
   columns = ['A', 'B', 'C', 'D'])
df

                   A         B         C         D
2000-01-01  0.371455 -0.191824 -0.146096 -1.347259
2000-01-02 -1.571376  0.061927  0.149302 -0.507093
2000-01-03  0.015607  1.637870 -0.642065 -0.228584
2000-01-04 -0.236157  0.366852 -0.117198  1.373123
2000-01-05 -0.390561 -0.670603 -2.022454  0.964826
2000-01-06 -0.309272  1.234031 -0.383297  0.234326
2000-01-07 -0.925264  0.417228 -0.432956 -1.331263
2000-01-08  0.223505  0.160549 -0.247965  0.262888
2000-01-09 -2.442173  0.757845 -0.704929  0.037361
2000-01-10 -0.936853 -0.479592 -0.274561  0.146732

r = df.rolling(window=3, min_periods=1)
r


Rolling [window=3,min_periods=1,center=False,axis=0]


r.aggregate(np.sum)

                   A         B         C         D
2000-01-01  0.371455 -0.191824 -0.146096 -1.347259
2000-01-02 -1.199921 -0.129897  0.003207 -1.854353
2000-01-03 -1.184314  1.507972 -0.638858 -2.082937
2000-01-04 -1.791925  2.066648 -0.609961  0.637446
2000-01-05 -0.611110  1.334119 -2.781717  2.109366
2000-01-06 -0.935989  0.930280 -2.522949  2.572276
2000-01-07 -1.625096  0.980656 -2.838707 -0.132111
2000-01-08 -1.011030  1.811808 -1.064218 -0.834049
2000-01-09 -3.143932  1.335622 -1.385850 -1.031014
2000-01-10 -3.155521  0.438802 -1.227455  0.446981

r['A'].aggregate(np.sum)


2000-01-01    0.371455
2000-01-02   -1.199921
2000-01-03   -1.184314
2000-01-04   -1.791925
2000-01-05   -0.611110
2000-01-06   -0.935989
2000-01-07   -1.625096
2000-01-08   -1.011030
2000-01-09   -3.143932
2000-01-10   -3.155521
Freq: D, Name: A, dtype: float64


r[['A', 'B']].aggregate(np.sum)

                   A         B
2000-01-01  0.371455 -0.191824
2000-01-02 -1.199921 -0.129897
2000-01-03 -1.184314  1.507972
2000-01-04 -1.791925  2.066648
2000-01-05 -0.611110  1.334119
2000-01-06 -0.935989  0.930280
2000-01-07 -1.625096  0.980656
2000-01-08 -1.011030  1.811808
2000-01-09 -3.143932  1.335622
2000-01-10 -3.155521  0.438802

r['A'].aggregate([np.sum, np.mean])

                 sum      mean
2000-01-01  0.371455  0.371455
2000-01-02 -1.199921 -0.599960
2000-01-03 -1.184314 -0.394771
2000-01-04 -1.791925 -0.597308
2000-01-05 -0.611110 -0.203703
2000-01-06 -0.935989 -0.311996
2000-01-07 -1.625096 -0.541699
2000-01-08 -1.011030 -0.337010
2000-01-09 -3.143932 -1.047977
2000-01-10 -3.155521 -1.051840

r.aggregate({'A':np.sum, 'B':np.mean})

                   A         B
2000-01-01  0.371455 -0.191824
2000-01-02 -1.199921 -0.064949
2000-01-03 -1.184314  0.502657
2000-01-04 -1.791925  0.688883
2000-01-05 -0.611110  0.444706
2000-01-06 -0.935989  0.310093
2000-01-07 -1.625096  0.326885
2000-01-08 -1.011030  0.603936
2000-01-09 -3.143932  0.445207
2000-01-10 -3.155521  0.146267
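
aggregate also accepts a dict whose values are lists, producing several statistics per column under multi-level column labels (a sketch on the same rolling object):

r.aggregate({'A': [np.sum, np.mean], 'B': np.mean})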

df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'],
                  columns=['one', 'two', 'three'])

df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df

        one       two     three
a  0.560200  0.244413 -1.814612
b       NaN       NaN       NaN
c  0.210847  0.014889 -0.711094
d       NaN       NaN       NaN
e -0.340756  1.657751  0.419182
f -0.699982  0.258028  1.324182
g       NaN       NaN       NaN
h -1.271993  1.477846  0.488302

NaN: Not a Number

isnull() notnull()

df.isnull()

     one    two  three
a  False  False  False
b   True   True   True
c  False  False  False
d   True   True   True
e  False  False  False
f  False  False  False
g   True   True   True
h  False  False  False

df.notnull()

     one    two  three
a   True   True   True
b  False  False  False
c   True   True   True
d  False  False  False
e   True   True   True
f   True   True   True
g  False  False  False
h   True   True   True

Computations with missing values

When summing, missing values are treated as 0. Older pandas returned NA when the data was entirely NA, but as the example below shows the result is now 0: since pandas 0.22 the sum of an all-NA (or empty) column defaults to 0 unless min_count is set (see the sketch after the output).

df['one'].sum()


-1.541684617991043


df = pd.DataFrame(index=[0,1,2,3,4,5],columns=['one','two'])
df

   one  two
0  NaN  NaN
1  NaN  NaN
2  NaN  NaN
3  NaN  NaN
4  NaN  NaN
5  NaN  NaN

df['one'].sum()


0
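
To recover the "all-NA sums to NA" behaviour, sum exposes the min_count and skipna parameters (a short sketch on the all-NA frame above):

df['one'].sum(min_count=1)    # NaN: fewer than 1 non-NA value is present
df['one'].sum(skipna=False)   # NaN: NA values are no longer skipped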

Cleaning and replacing missing values

df = pd.DataFrame(np.random.randn(3, 3), index=['a', 'c', 'e'],
                  columns=['one', 'two', 'three'])

df = df.reindex(['a', 'b', 'c'])
df

        one       two     three
a  0.326460  1.529225  1.230027
b       NaN       NaN       NaN
c  1.296313  0.032379  2.182915

df.fillna(0)

        one       two     three
a  0.326460  1.529225  1.230027
b  0.000000  0.000000  0.000000
c  1.296313  0.032379  2.182915

df  # so it looks like fillna above returned a copy instead of modifying df in place

        one       two     three
a  0.326460  1.529225  1.230027
b       NaN       NaN       NaN
c  1.296313  0.032379  2.182915

df.fillna('haha')

       one        two    three
a  0.32646    1.52922  1.23003
b     haha       haha     haha
c  1.29631  0.0323788  2.18292

Forward and backward fill

df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'],
                  columns=['one', 'two', 'three'])

df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df

        one       two     three
a  2.218769 -0.742408 -1.068846
b       NaN       NaN       NaN
c  0.467651  0.372357  1.387020
d       NaN       NaN       NaN
e -0.868840 -0.648827 -2.261319
f -0.755799  0.159130 -0.129401
g       NaN       NaN       NaN
h  0.703744 -1.665470  0.166229

df.fillna(method='pad')  # or method='ffill'

        one       two     three
a  2.218769 -0.742408 -1.068846
b  2.218769 -0.742408 -1.068846
c  0.467651  0.372357  1.387020
d  0.467651  0.372357  1.387020
e -0.868840 -0.648827 -2.261319
f -0.755799  0.159130 -0.129401
g -0.755799  0.159130 -0.129401
h  0.703744 -1.665470  0.166229

As you can see, the missing values are filled with the preceding values.
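
The limit parameter caps how many consecutive NaNs are filled in each gap (a sketch on the same frame):

df.fillna(method='pad', limit=1)   # forward-fill at most one consecutive NaN per gap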

df.fillna(method="backfill")  # or method='bfill'

        one       two     three
a  2.218769 -0.742408 -1.068846
b  0.467651  0.372357  1.387020
c  0.467651  0.372357  1.387020
d -0.868840 -0.648827 -2.261319
e -0.868840 -0.648827 -2.261319
f -0.755799  0.159130 -0.129401
g  0.703744 -1.665470  0.166229
h  0.703744 -1.665470  0.166229

help(df.fillna)


Help on method fillna in module pandas.core.frame:

fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs) method of pandas.core.frame.DataFrame instance
    Fill NA/NaN values using the specified method

    Parameters
    ----------
    value : scalar, dict, Series, or DataFrame
        Value to use to fill holes (e.g. 0), alternately a
        dict/Series/DataFrame of values specifying which value to use for
        each index (for a Series) or column (for a DataFrame). (values not
        in the dict/Series/DataFrame will not be filled). This value cannot
        be a list.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    axis : {0 or 'index', 1 or 'columns'}
    inplace : boolean, default False
        If True, fill in place. Note: this will modify any
        other views on this object, (e.g. a no-copy slice for a column in a
        DataFrame).
    limit : int, default None
        If method is specified, this is the maximum number of consecutive
        NaN values to forward/backward fill. In other words, if there is
        a gap with more than this number of consecutive NaNs, it will only
        be partially filled. If method is not specified, this is the
        maximum number of entries along the entire axis where NaNs will be
        filled. Must be greater than 0 if not None.
    downcast : dict, default is None
        a dict of item->dtype of what to downcast if possible,
        or the string 'infer' which will try to downcast to an appropriate
        equal type (e.g. float64 to int64 if possible)

    See Also
    --------
    interpolate : Fill NaN values using interpolation.
    reindex, asfreq

    Returns
    -------
    filled : DataFrame

    Examples
    --------
    >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
    ...                    [3, 4, np.nan, 1],
    ...                    [np.nan, np.nan, np.nan, 5],
    ...                    [np.nan, 3, np.nan, 4]],
    ...                    columns=list('ABCD'))
    >>> df
         A    B   C  D
    0  NaN  2.0 NaN  0
    1  3.0  4.0 NaN  1
    2  NaN  NaN NaN  5
    3  NaN  3.0 NaN  4

    Replace all NaN elements with 0s.

    >>> df.fillna(0)
        A   B   C   D
    0   0.0 2.0 0.0 0
    1   3.0 4.0 0.0 1
    2   0.0 0.0 0.0 5
    3   0.0 3.0 0.0 4

    We can also propagate non-null values forward or backward.

    >>> df.fillna(method='ffill')
        A   B   C   D
    0   NaN 2.0 NaN 0
    1   3.0 4.0 NaN 1
    2   3.0 4.0 NaN 5
    3   3.0 3.0 NaN 4

    Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
    2, and 3 respectively.

    >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    >>> df.fillna(value=values)
        A   B   C   D
    0   0.0 2.0 2.0 0
    1   3.0 4.0 2.0 1
    2   0.0 1.0 2.0 5
    3   0.0 3.0 2.0 4

    Only replace the first NaN element.

    >>> df.fillna(value=values, limit=1)
        A   B   C   D
    0   0.0 2.0 2.0 0
    1   3.0 4.0 NaN 1
    2   NaN 1.0 NaN 5
    3   NaN 3.0 NaN 4

Dropping missing data

We can use dropna to drop the rows (the default axis) or columns that contain missing data.

df.dropna()

        one       two     three
a  2.218769 -0.742408 -1.068846
c  0.467651  0.372357  1.387020
e -0.868840 -0.648827 -2.261319
f -0.755799  0.159130 -0.129401
h  0.703744 -1.665470  0.166229

df = pd.DataFrame({'one':np.arange(10), 'two':np.arange(10), 'three':np.arange(10)})
df.iloc[1, 2] = np.nan
df

   one  two  three
0    0    0    0.0
1    1    1    NaN
2    2    2    2.0
3    3    3    3.0
4    4    4    4.0
5    5    5    5.0
6    6    6    6.0
7    7    7    7.0
8    8    8    8.0
9    9    9    9.0

df.dropna(axis=1)

   one  two
0    0    0
1    1    1
2    2    2
3    3    3
4    4    4
5    5    5
6    6    6
7    7    7
8    8    8
9    9    9
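
dropna can also be made less aggressive: how='all' drops a row or column only when every value is missing, and thresh keeps rows with at least that many non-NA values (a sketch on the frame above):

df.dropna(axis=1, how='all')   # keeps 'three', since it still has non-NA values
df.dropna(thresh=3)            # drops only row 1, which has fewer than 3 non-NA values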

df = pd.DataFrame({'one':[10,20,30,40,50,2000], 'two':[1000,0,30,40,50,60]})
df

    one   two
0    10  1000
1    20     0
2    30    30
3    40    40
4    50    50
5  2000    60

df.replace({1000:10, 2000:66})

   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   66   60

help(df.replace)


Help on method replace in module pandas.core.frame:

replace(to_replace=None, value=None, inplace=False, limit=None, regex=False, method='pad') method of pandas.core.frame.DataFrame instance
    Replace values given in `to_replace` with `value`.

    Values of the DataFrame are replaced with other values dynamically.
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.

    Parameters
    ----------
    to_replace : str, regex, list, dict, Series, int, float, or None
        How to find the values that will be replaced.

        * numeric, str or regex:

            - numeric: numeric values equal to `to_replace` will be
              replaced with `value`
            - str: string exactly matching `to_replace` will be replaced
              with `value`
            - regex: regexs matching `to_replace` will be replaced with
              `value`

        * list of str, regex, or numeric:

            - First, if `to_replace` and `value` are both lists, they
              **must** be the same length.
            - Second, if ``regex=True`` then all of the strings in **both**
              lists will be interpreted as regexs otherwise they will match
              directly. This doesn't matter much for `value` since there
              are only a few possible substitution regexes you can use.
            - str, regex and numeric rules apply as above.

        * dict:

            - Dicts can be used to specify different replacement values
              for different existing values. For example,
              ``{'a': 'b', 'y': 'z'}`` replaces the value 'a' with 'b' and
              'y' with 'z'. To use a dict in this way the `value`
              parameter should be `None`.
            - For a DataFrame a dict can specify that different values
              should be replaced in different columns. For example,
              ``{'a': 1, 'b': 'z'}`` looks for the value 1 in column 'a'
              and the value 'z' in column 'b' and replaces these values
              with whatever is specified in `value`. The `value` parameter
              should not be ``None`` in this case. You can treat this as a
              special case of passing two lists except that you are
              specifying the column to search in.
            - For a DataFrame nested dictionaries, e.g.,
              ``{'a': {'b': np.nan}}``, are read as follows: look in column
              'a' for the value 'b' and replace it with NaN. The `value`
              parameter should be ``None`` to use a nested dict in this
              way. You can nest regular expressions as well. Note that
              column names (the top-level dictionary keys in a nested
              dictionary) **cannot** be regular expressions.

        * None:

            - This means that the `regex` argument must be a string,
              compiled regular expression, or list, dict, ndarray or
              Series of such elements. If `value` is also ``None`` then
              this **must** be a nested dictionary or Series.

        See the examples section for examples of each of these.
    value : scalar, dict, list, str, regex, default None
        Value to replace any values matching `to_replace` with.
        For a DataFrame a dict of values can be used to specify which
        value to use for each column (columns not in the dict will not be
        filled). Regular expressions, strings and lists or dicts of such
        objects are also allowed.
    inplace : boolean, default False
        If True, in place. Note: this will modify any
        other views on this object (e.g. a column from a DataFrame).
        Returns the caller if this is True.
    limit : int, default None
        Maximum size gap to forward or backward fill.
    regex : bool or same types as `to_replace`, default False
        Whether to interpret `to_replace` and/or `value` as regular
        expressions. If this is ``True`` then `to_replace` *must* be a
        string. Alternatively, this could be a regular expression or a
        list, dict, or array of regular expressions in which case
        `to_replace` must be ``None``.
    method : {'pad', 'ffill', 'bfill', `None`}
        The method to use when for replacement, when `to_replace` is a
        scalar, list or tuple and `value` is ``None``.

        .. versionchanged:: 0.23.0
            Added to DataFrame.

    See Also
    --------
    DataFrame.fillna : Fill NA values
    DataFrame.where : Replace values based on boolean condition
    Series.str.replace : Simple string replacement.

    Returns
    -------
    DataFrame
        Object after replacement.

    Raises
    ------
    AssertionError
        * If `regex` is not a ``bool`` and `to_replace` is not
          ``None``.
    TypeError
        * If `to_replace` is a ``dict`` and `value` is not a ``list``,
          ``dict``, ``ndarray``, or ``Series``
        * If `to_replace` is ``None`` and `regex` is not compilable
          into a regular expression or is a list, dict, ndarray, or
          Series.
        * When replacing multiple ``bool`` or ``datetime64`` objects and
          the arguments to `to_replace` does not match the type of the
          value being replaced
    ValueError
        * If a ``list`` or an ``ndarray`` is passed to `to_replace` and
          `value` but they are not the same length.

    Notes
    -----
    * Regex substitution is performed under the hood with ``re.sub``. The
      rules for substitution for ``re.sub`` are the same.
    * Regular expressions will only substitute on strings, meaning you
      cannot provide, for example, a regular expression matching floating
      point numbers and expect the columns in your frame that have a
      numeric dtype to be matched. However, if those floating point
      numbers *are* strings, then you can do this.
    * This method has *a lot* of options. You are encouraged to experiment
      and play with this method to gain intuition about how it works.
    * When dict is used as the `to_replace` value, it is like
      key(s) in the dict are the to_replace part and
      value(s) in the dict are the value parameter.

    Examples
    --------

    **Scalar `to_replace` and `value`**

    >>> s = pd.Series([0, 1, 2, 3, 4])
    >>> s.replace(0, 5)
    0    5
    1    1
    2    2
    3    3
    4    4
    dtype: int64

    >>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
    ...                    'B': [5, 6, 7, 8, 9],
    ...                    'C': ['a', 'b', 'c', 'd', 'e']})
    >>> df.replace(0, 5)
       A  B  C
    0  5  5  a
    1  1  6  b
    2  2  7  c
    3  3  8  d
    4  4  9  e

    **List-like `to_replace`**

    >>> df.replace([0, 1, 2, 3], 4)
       A  B  C
    0  4  5  a
    1  4  6  b
    2  4  7  c
    3  4  8  d
    4  4  9  e

    >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1])
       A  B  C
    0  4  5  a
    1  3  6  b
    2  2  7  c
    3  1  8  d
    4  4  9  e

    >>> s.replace([1, 2], method='bfill')
    0    0
    1    3
    2    3
    3    3
    4    4
    dtype: int64

    **dict-like `to_replace`**

    >>> df.replace({0: 10, 1: 100})
         A  B  C
    0   10  5  a
    1  100  6  b
    2    2  7  c
    3    3  8  d
    4    4  9  e

    >>> df.replace({'A': 0, 'B': 5}, 100)
         A    B  C
    0  100  100  a
    1    1    6  b
    2    2    7  c
    3    3    8  d
    4    4    9  e

    >>> df.replace({'A': {0: 100, 4: 400}})
         A  B  C
    0  100  5  a
    1    1  6  b
    2    2  7  c
    3    3  8  d
    4  400  9  e

    **Regular expression `to_replace`**

    >>> df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
    ...                    'B': ['abc', 'bar', 'xyz']})
    >>> df.replace(to_replace=r'^ba.$', value='new', regex=True)
          A    B
    0   new  abc
    1   foo  new
    2  bait  xyz

    >>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True)
          A    B
    0   new  abc
    1   foo  bar
    2  bait  xyz

    >>> df.replace(regex=r'^ba.$', value='new')
          A    B
    0   new  abc
    1   foo  new
    2  bait  xyz

    >>> df.replace(regex={r'^ba.$':'new', 'foo':'xyz'})
          A    B
    0   new  abc
    1   xyz  new
    2  bait  xyz

    >>> df.replace(regex=[r'^ba.$', 'foo'], value='new')
          A    B
    0   new  abc
    1   new  new
    2  bait  xyz

    Note that when replacing multiple ``bool`` or ``datetime64`` objects,
    the data types in the `to_replace` parameter must match the data
    type of the value being replaced:

    >>> df = pd.DataFrame({'A': [True, False, True],
    ...                    'B': [False, True, False]})
    >>> df.replace({'a string': 'new value', True: False})  # raises
    Traceback (most recent call last):
        ...
    TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str'

    This raises a ``TypeError`` because one of the ``dict`` keys is not of
    the correct type for replacement.

    Compare the behavior of ``s.replace({'a': None})`` and
    ``s.replace('a', None)`` to understand the pecularities
    of the `to_replace` parameter:

    >>> s = pd.Series([10, 'a', 'a', 'b', 'a'])

    When one uses a dict as the `to_replace` value, it is like the
    value(s) in the dict are equal to the `value` parameter.
    ``s.replace({'a': None})`` is equivalent to
    ``s.replace(to_replace={'a': None}, value=None, method=None)``:

    >>> s.replace({'a': None})
    0      10
    1    None
    2    None
    3       b
    4    None
    dtype: object

    When ``value=None`` and `to_replace` is a scalar, list or
    tuple, `replace` uses the method parameter (default 'pad') to do the
    replacement. So this is why the 'a' values are being replaced by 10
    in rows 1 and 2 and 'b' in row 4 in this case.
    The command ``s.replace('a', None)`` is actually equivalent to
    ``s.replace(to_replace='a', value=None, method='pad')``:

    >>> s.replace('a', None)
    0    10
    1    10
    2    10
    3     b
    4     b
    dtype: object


df = pd.DataFrame({'one':np.arange(10), 'two':np.arange(2, 12)})
df

   one  two
0    0    2
1    1    3
2    2    4
3    3    5
4    4    6
5    5    7
6    6    8
7    7    9
8    8   10
9    9   11

df.replace(r'4', 'haha', regex=True)

   one  two
0    0    2
1    1    3
2    2    4
3    3    5
4    4    6
5    5    7
6    6    8
7    7    9
8    8   10
9    9   11

s = pd.Series(['1', '2', '3'])
s


0    1
1    2
2    3
dtype: object


s.replace(r'3', 4, regex=True)


0    1
1    2
2    4
dtype: object


df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
                   'B': ['abc', 'bar', 'xyz']})
df

      A    B
0   bat  abc
1   foo  bar
2  bait  xyz

df.replace(r'ba', 'new', regex=True)

       A     B
0   newt   abc
1    foo  newr
2  newit   xyz

It appears that regex replacement only works on values that are already strings; a cast-to-string workaround is sketched below.
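
If regex replacement is needed on numeric data, one option (a sketch; note that the dtype becomes object/str) is to cast to string first:

df = pd.DataFrame({'one': np.arange(10), 'two': np.arange(2, 12)})
df.astype(str).replace(r'4', 'haha', regex=True)   # now the '4' characters are matched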
