3_ctx_th_connection_and_gene_correlation

[1]:

import pandas as pd
import scanpy as sc
from tqdm import tqdm
import matplotlib.pyplot as plt
import anndata as ad
import numpy as np
import warnings
from scipy.stats import pearsonr
import seaborn as sns
from scipy.cluster.hierarchy import fcluster, linkage
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

warnings.filterwarnings('ignore')

[2]:

adj = np.load('/mnt/Data18Td/Data/haichao/mouse_connect_data/NT/zxw/mouse_3/zxw_not_symmetric_adj.npy', mmap_mode='r')
print(adj.shape)

adata = sc.read_h5ad('/mnt/Data16Tc/home/haichao/code/SpaCon/ST_NT_cluster/SpaCon_apply_zxw/data/mouse_3/adata_merge.h5ad')
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
allen_region = pd.read_csv('/mnt/Data16Tc/home/haichao/code/SpaCon/ST_NT_cluster/SpaCon_apply_zxw/data/mouse_3/allen_region.csv')
adata.obs['region'] = allen_region['region'].to_list()
adata.obs['spot_num'] = [i for i in range(adata.n_obs)]
adata

(107909, 107909)

[2]:

AnnData object with n_obs × n_vars = 107909 × 1122
    obs: 'x', 'y', 'z', 'section', 'NT_index', 'Cells_id', 'region', 'spot_num'
    uns: 'log1p'

subregion th ctx correlation

[3]:

th_regions = ['AD', 'AMd', 'AMv', 'AV', 'CL', 'CM', 'IAD', 'IAM', 'IGL', 'IMD', 'LD', 'LGv', 'LH', 'LP', 'MD', 'MGd', 'MGm', 'MGv', 'MH', 'PCN', 'PF', 'PIL', 'PO', 'POL',
                'PP', 'PR', 'PT', 'PVT', 'PoT', 'RE', 'RH', 'RT', 'SGN', 'SMT', 'SPA', 'SPFm', 'SPFp', 'VAL', 'VM', 'VPL', 'VPLpc', 'VPM', 'VPMpc', 'Xi']
ctx_regions = ['ACAd', 'ACAv', 'AId', 'AIp', 'AIv', 'AUDd', 'AUDp', 'AUDpo',
                'AUDv', 'ECT', 'FRP', 'GU', 'ILA', 'MOp', 'MOs', 'ORBl', 'ORBm',
                'ORBvl', 'PERI', 'PL', 'RSPagl', 'RSPd', 'RSPv', 'SSp-bfd',
                'SSp-ll', 'SSp-m', 'SSp-n', 'SSp-tr', 'SSp-ul', 'SSp-un', 'SSs',
                'TEa', 'VISC', 'VISa', 'VISal', 'VISam', 'VISl', 'VISli', 'VISp',
                'VISpl', 'VISpm', 'VISpor', 'VISrl']

adata_ctx_th_area = adata[(adata.obs['region'].str.startswith(tuple(ctx_regions))) | (adata.obs['region'].isin(th_regions))]
adata_ctx_th_area.obs['CT_spot_num'] = [i for i in range(adata_ctx_th_area.n_obs)]
adata_ctx_th_area.obs['ctx_or_th'] = 'nan'
for c in ctx_regions:
    adata_ctx_th_area.obs.loc[adata_ctx_th_area.obs['region'].str.startswith(c), 'ctx_or_th'] = 'ctx'
for c in th_regions:
    adata_ctx_th_area.obs.loc[adata_ctx_th_area.obs['region']==c, 'ctx_or_th'] = 'th'
adata_ctx_th_area.obs

[3]:

	x	y	z	section	NT_index	Cells_id	region	spot_num	CT_spot_num	ctx_or_th
2	19.113418	23.994053	46.894548	Zhuang-ABCA-3.005	2	188949490498702611647640449216501815503_304478...	FRP1	0	0	ctx
4	18.921382	24.017347	49.257099	Zhuang-ABCA-3.004	4	123445075025856302518661678660778224944_131347...	FRP1	1	1	ctx
6	19.079053	23.564575	51.258567	Zhuang-ABCA-3.003	6	319072152716142889780926524544977408256_106358...	FRP1	2	2	ctx
7	18.816739	25.041919	43.046671	Zhuang-ABCA-3.007	7	47102176527243727968506801819481818675_1792889...	FRP1	3	3	ctx
9	18.792692	25.077860	45.101231	Zhuang-ABCA-3.006	9	199428730039264634225478952930502449426_202483...	FRP1	4	4	ctx
...	...	...	...	...	...	...	...	...	...	...
148907	94.906133	15.937666	42.404653	Zhuang-ABCA-3.007	148907	106526352840576802488939912804342681931_107003...	RSPd1	75568	33095	ctx
148919	95.201531	17.993761	40.377543	Zhuang-ABCA-3.008	148919	133921573428284040813862276100319739380_145548...	RSPd1	75574	33096	ctx
148971	96.095220	15.844123	42.457680	Zhuang-ABCA-3.007	148971	149021058656557956290496348449572943108_174675...	RSPd1	75604	33097	ctx
148993	97.188915	11.409533	46.662115	Zhuang-ABCA-3.005	148993	107188929587695452503950023602364017291_167314...	RSPd1	75616	33098	ctx
193053	104.490816	24.420635	25.679869	Zhuang-ABCA-3.015	193053	43354620289828560228522986006306173486	VISpl1	99924	33099	ctx

33100 rows × 10 columns

[4]:

# Generate ROIs

mergecell_num = 20
def cluster_region(df_region, num_points=mergecell_num):
    # Perform hierarchical cluster
    Z = linkage(df_region[['x', 'y', 'z']], method='ward')
    # Divided into clusters according to the results of the cluster, each cluster has about NUM_POINTS individual points
    cluster_labels = fcluster(Z, t=len(df_region) / num_points, criterion='maxclust')
    return cluster_labels

# Initialize a empty cluster label
adata_ctx_th_area.obs['subregion'] = 0

# Perform clusters separately for each area
for region in adata_ctx_th_area.obs['region'].unique():
    mask = adata_ctx_th_area.obs['region'] == region
    if adata_ctx_th_area[adata_ctx_th_area.obs['region'] == region].shape[0] < 15:
        adata_ctx_th_area.obs.loc[mask, 'subregion'] = adata_ctx_th_area.obs.loc[mask, 'region'].apply(lambda x: x + '_1')
        continue

    clu_list = cluster_region(adata_ctx_th_area.obs[mask])
    cl_list = [f'{region}_{i}' for i in clu_list]
    adata_ctx_th_area.obs.loc[mask, 'subregion'] = cl_list

adata_ctx_th_area.obs

[4]:

	x	y	z	section	NT_index	Cells_id	region	spot_num	CT_spot_num	ctx_or_th	subregion
2	19.113418	23.994053	46.894548	Zhuang-ABCA-3.005	2	188949490498702611647640449216501815503_304478...	FRP1	0	0	ctx	FRP1_2
4	18.921382	24.017347	49.257099	Zhuang-ABCA-3.004	4	123445075025856302518661678660778224944_131347...	FRP1	1	1	ctx	FRP1_1
6	19.079053	23.564575	51.258567	Zhuang-ABCA-3.003	6	319072152716142889780926524544977408256_106358...	FRP1	2	2	ctx	FRP1_1
7	18.816739	25.041919	43.046671	Zhuang-ABCA-3.007	7	47102176527243727968506801819481818675_1792889...	FRP1	3	3	ctx	FRP1_2
9	18.792692	25.077860	45.101231	Zhuang-ABCA-3.006	9	199428730039264634225478952930502449426_202483...	FRP1	4	4	ctx	FRP1_2
...	...	...	...	...	...	...	...	...	...	...	...
148907	94.906133	15.937666	42.404653	Zhuang-ABCA-3.007	148907	106526352840576802488939912804342681931_107003...	RSPd1	75568	33095	ctx	RSPd1_3
148919	95.201531	17.993761	40.377543	Zhuang-ABCA-3.008	148919	133921573428284040813862276100319739380_145548...	RSPd1	75574	33096	ctx	RSPd1_3
148971	96.095220	15.844123	42.457680	Zhuang-ABCA-3.007	148971	149021058656557956290496348449572943108_174675...	RSPd1	75604	33097	ctx	RSPd1_3
148993	97.188915	11.409533	46.662115	Zhuang-ABCA-3.005	148993	107188929587695452503950023602364017291_167314...	RSPd1	75616	33098	ctx	RSPd1_2
193053	104.490816	24.420635	25.679869	Zhuang-ABCA-3.015	193053	43354620289828560228522986006306173486	VISpl1	99924	33099	ctx	VISpl1_2

33100 rows × 11 columns

[6]:

regions = adata_ctx_th_area.obs['subregion'].values
gene_expression = adata_ctx_th_area.X.A

# Create a DataFrame to integrate regional information and gene expression matrix together
df = pd.DataFrame(gene_expression, columns=adata.var_names)
df['subregion'] = regions

# Calculate the average gene expression by regional grouping and calculate the average gene
region_mean_expression = df.groupby('subregion').mean()
# Convert the result to the matrix format of the regional*gene
region_gene_matrix = region_mean_expression.values
region_gene_matrix.shape

[6]:

(1719, 1122)

[7]:

corr_all = np.corrcoef(region_gene_matrix)
corr_all.shape

[7]:

(1719, 1719)

[8]:

adata_th = adata_ctx_th_area[adata_ctx_th_area.obs['ctx_or_th'] == 'th']
th_subregions = adata_th.obs['subregion'].unique()
th_subregions.shape

[8]:

(244,)

[9]:

adata_ctx = adata_ctx_th_area[adata_ctx_th_area.obs['ctx_or_th'] == 'ctx']
ctx_subregions = adata_ctx.obs['subregion'].unique()
ctx_subregions.shape

[9]:

(1475,)

[10]:

region_labels = region_mean_expression.index
ctx_indices = [i for i, label in enumerate(region_labels) if label in ctx_subregions]
th_indices = [i for i, label in enumerate(region_labels) if label in th_subregions]
corr = corr_all[np.ix_(ctx_indices, th_indices)]
corr.shape

[10]:

(1475, 244)

[11]:

ctx_regions_df = [region_labels[i] for i in ctx_indices]
th_regions_df = [region_labels[i] for i in th_indices]
corr_df = pd.DataFrame(corr, index=ctx_regions_df, columns=th_regions_df)
corr_df

[11]:

	AD_1	AMd_1	AMd_2	AMv_1	AV_1	AV_2	AV_3	AV_4	AV_5	CL_1	...	VPM_23	VPM_3	VPM_4	VPM_5	VPM_6	VPM_7	VPM_8	VPM_9	VPMpc_1	VPMpc_2
ACAd1_1	0.596044	0.629380	0.629344	0.640548	0.625990	0.616779	0.625027	0.579291	0.607824	0.661248	...	0.546631	0.580518	0.560330	0.552390	0.588024	0.567292	0.550110	0.595391	0.673908	0.651029
ACAd2/3_1	0.597459	0.627501	0.638063	0.645888	0.624184	0.613061	0.610360	0.584868	0.599027	0.632073	...	0.563330	0.591786	0.587295	0.583098	0.590525	0.565894	0.574953	0.613022	0.647487	0.650415
ACAd2/3_2	0.571027	0.595831	0.606491	0.613162	0.593237	0.584572	0.577454	0.553500	0.566546	0.593970	...	0.539473	0.565224	0.567445	0.557278	0.561655	0.535150	0.554062	0.588653	0.610546	0.617166
ACAd2/3_3	0.553607	0.575565	0.594226	0.598235	0.573342	0.568132	0.569627	0.543022	0.554639	0.581051	...	0.514714	0.539524	0.537075	0.534365	0.534878	0.514530	0.527182	0.561045	0.591705	0.611740
ACAd5_1	0.630058	0.641793	0.647781	0.654390	0.634837	0.628741	0.631047	0.606152	0.623477	0.657897	...	0.572636	0.597883	0.591053	0.575654	0.597200	0.584400	0.576911	0.614796	0.669636	0.661564
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
VISrl4_1	0.608806	0.629851	0.637136	0.650792	0.617430	0.627219	0.614035	0.611263	0.604888	0.627469	...	0.577205	0.623548	0.610158	0.606235	0.613583	0.559556	0.614110	0.639994	0.649961	0.646791
VISrl5_1	0.649082	0.647604	0.668424	0.675256	0.653244	0.647948	0.637781	0.631115	0.636032	0.648355	...	0.617031	0.645198	0.637634	0.624230	0.644994	0.610415	0.630391	0.650415	0.684320	0.669412
VISrl5_2	0.643372	0.636174	0.654321	0.670814	0.631363	0.641428	0.625480	0.634172	0.618452	0.648326	...	0.587593	0.635670	0.617376	0.604309	0.627833	0.580243	0.623062	0.645508	0.675442	0.656189
VISrl6a_1	0.587644	0.558709	0.591176	0.606841	0.577877	0.581434	0.567537	0.566168	0.559256	0.581017	...	0.536177	0.560639	0.547816	0.527045	0.555997	0.526904	0.540306	0.558628	0.609119	0.590757
VISrl6b_1	0.657098	0.647202	0.689478	0.700880	0.679191	0.673361	0.654070	0.618012	0.638369	0.687741	...	0.614595	0.632637	0.627957	0.580900	0.653153	0.615239	0.588846	0.612918	0.721750	0.671169

1475 rows × 244 columns

connection

ctx2th conn

[12]:

ctx_ave_list = []
for c in tqdm(ctx_regions_df):
    adata_ctx_area = adata_ctx_th_area[adata_ctx_th_area.obs['subregion'] == c]

    # break
    idx = adata_ctx_area.obs['spot_num'].tolist()
    temp = np.mean(adj[idx], axis=0)
    ctx_ave_list.append(temp)
    # break
ctx2th_ave = np.vstack(ctx_ave_list)
ctx2th_ave.shape

100%|██████████| 1475/1475 [09:42<00:00,  2.53it/s]

[12]:

(1475, 107909)

[13]:

th_ave_list = []
for th_region in tqdm(th_regions_df):
    adata_th_area = adata_ctx_th_area[adata_ctx_th_area.obs['subregion'] == th_region]
    # break
    idx = adata_th_area.obs['spot_num'].tolist()
    # break
    temp = np.mean(ctx2th_ave[:, idx], axis=1)
    th_ave_list.append(temp)

ctx2th_ave = np.column_stack(th_ave_list)
conn_df = pd.DataFrame(data=ctx2th_ave, index=ctx_regions_df, columns=th_regions_df, dtype=float)
ctx2th_ave.shape

100%|██████████| 244/244 [00:00<00:00, 359.77it/s]

[13]:

(1475, 244)

[14]:

corr = corr.flatten()
conn = ctx2th_ave.flatten()
plt.figure(figsize=(8, 6))
ax = sns.scatterplot(conn, corr, s=30, alpha=0.5)

plt.xlabel('Connection_strength', fontweight='bold')
plt.ylabel('Correlation', fontweight='bold')
plt.tight_layout()

_images/3_ctx_th_connection_and_gene_correlation_16_0.png

[28]:

df = pd.DataFrame({'Correlation': corr,
                   'Connection_strength': conn})
# df = df[df['Connection_strength'] > 0.0005]
min_corr = df['Connection_strength'].min()
max_corr = df['Connection_strength'].max()
bins = np.linspace(0, max_corr, 31)
df['group'] = pd.cut(df['Connection_strength'], bins, include_lowest=False)

result = df.groupby('group')['Correlation'].median().reset_index()
result['Connection_strength'] = result['group'].apply(lambda x: (x.left + x.right) / 2)
result = result.dropna(subset=['Correlation'])

import scipy.stats as stats
correlation, p_value = pearsonr(result['Connection_strength'], result['Correlation'])
sns.regplot(data=result, x="Connection_strength", y="Correlation",
            scatter_kws={'s': 30, 'color': '#2878B5', 'alpha': 1, 'marker': 'o'},
            line_kws={'linewidth': 2, 'color': '#D8383A', 'linestyle': '-'})
slope, intercept, r_value, p_value, std_err = stats.linregress(result["Connection_strength"], result["Correlation"])
print(f"(slope): {slope}")
# plt.text(0.95, 0.01, f'Corr: {correlation:.2f}', ha='right', va='bottom', transform=plt.gca().transAxes)
plt.title(f'r = {correlation:.3f},    p = {p_value:.2e}', fontweight='bold')
plt.xlabel('Connection_strength', fontweight='bold')
plt.ylabel('Correlation', fontweight='bold')
plt.savefig('./th_ctx_corr_conn_scatter_line.pdf', format='pdf')
# plt.savefig(f'/mnt/Data16Tc/home/haichao/code/SpaCon/PaperFig/new_brief_comm_fig2/th_ctx_region_correlation/zxw3_all_40bins_merge{mergecell_num}cell.png', dpi=600)

斜率 (slope): 15.42913379110221

_images/3_ctx_th_connection_and_gene_correlation_17_1.png

[33]:

from scipy.stats import gaussian_kde

x = conn
y = corr

# Create a grid
xi, yi = np.mgrid[x.min():x.max():100j, y.min():y.max():100j]

# Calculate nuclear density estimation
positions = np.vstack([xi.ravel(), yi.ravel()])
values = np.vstack([x, y])
kernel = gaussian_kde(values)
z = np.reshape(kernel(positions).T, xi.shape)

# Draw scattered dots and outlines
fig, ax = plt.subplots()
# ax.scatter(x, y, c='k', alpha=0.3)
ax.contour(xi, yi, z, levels=100, cmap='viridis')

plt.show()

_images/3_ctx_th_connection_and_gene_correlation_18_0.png

[34]:

fig, ax = plt.subplots()

contour_set = ax.contour(xi, yi, z, levels=10000, cmap='viridis')

# Get the outermost contour (that is, the first outline)
outermost_contour = contour_set.collections[1]

# Extract the exterior outline path
paths = outermost_contour.get_paths()
outermost_path = paths[0]

# Draw the outermost outline
ax.plot(outermost_path.vertices[:, 0], outermost_path.vertices[:, 1], 'r-', linewidth=2)

plt.show()

# Get the coordinates of the outermost contour
outermost_contour_coords = outermost_path.vertices

Locator attempting to generate 8165 ticks ([0.0, ..., 32656.0]), which exceeds Locator.MAXTICKS (1000).

_images/3_ctx_th_connection_and_gene_correlation_19_1.png

[35]:

x_data = outermost_path.vertices[:, 1]
y_data = outermost_path.vertices[:, 0]

# Select the order of polynomial (you can adjust to get better fitting)
degree = 5

# Use Numpy's polynomial fitting
coeffs = np.polyfit(x_data, y_data, degree)
poly = np.poly1d(coeffs)

# Point to generate the fit curve
x_fit = np.linspace(min(x_data), max(x_data), 100)
y_fit = poly(x_fit)

# Draw the original data point and fit curve
plt.scatter(y_data, x_data, label='Data', s=3)
plt.plot(y_fit, x_fit, 'r-', label=f'Fitted Curve (degree {degree})')
plt.legend()
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Polynomial Fit')
plt.show()

# Print polynomial coefficient
print("Polynomial coefficient (from high -level to low -level):")
for i, coeff in enumerate(coeffs):
    print(f"x^{degree-i}: {coeff:.4f}")

_images/3_ctx_th_connection_and_gene_correlation_20_0.png

多项式系数（从高阶到低阶）:
x^5: 12.8520
x^4: -39.7188
x^3: 48.3887
x^2: -29.1442
x^1: 8.7148
x^0: -1.0373

[97]:

plt.figure(figsize=(8, 6))
ax = sns.scatterplot(conn, corr, s=30, alpha=0.5)
ax.plot(y_fit, x_fit, 'r--', label=f'Fitted Curve (degree {degree})', linewidth=2)
plt.xlabel('Connection_strength', fontweight='bold')
plt.ylabel('Correlation', fontweight='bold')
plt.xlim(0, 0.005)
ax.set_xticks([0.000, 0.001, 0.002, 0.003, 0.004, 0.005])  # Set the X -axis scale
ax.set_yticks([0.2, 0.4, 0.6, 0.8])  # Set y -axial scale
plt.tight_layout()
# plt.savefig('./corr_conn_scatter.png', dpi=600)

_images/3_ctx_th_connection_and_gene_correlation_21_0.png

[ ]:

from scipy.stats import skew, kurtosis, normaltest, jarque_bera
# Polynomial coefficient (from high -level to low -level)
coefficients = coeffs

# Define the polynomial function
poly_func = np.poly1d(coefficients)

# Generate X value
x = np.linspace(min(x_data), max(x_data), 1000)
# Calculate the Y value of polynomial
data = poly_func(x)

# Calculation bias and peak degree
skewness = skew(data)
kurt = kurtosis(data) + 3  # Scipy's Kurtosis function returns the peak degree (Excess Kurtosis), you need to add 3 to get the peak degree

print(f"skewness: {skewness}")
print(f"kurt: {kurt}")

# D'Agostino's K-squared test
k2_stat, k2_p_value = normaltest(data)
print(f"D'Agostino's K-squared: {k2_stat}")
print(f"p: {k2_p_value}")


# Jarque-Bera test
jb_stat, jb_p_value = jarque_bera(data)
print(f"Jarque-Bera: {jb_stat}")
print(f"p: {jb_p_value}")

[17]:

df = pd.DataFrame({'Correlation': corr,
                   'Connection_strength': conn})

min_corr = df['Connection_strength'].min()
max_corr = df['Connection_strength'].max()
bins = np.linspace(min_corr, max_corr, 10)
df['group'] = pd.cut(df['Connection_strength'], bins, include_lowest=True)
df['Connection_strength'] = df['group'].apply(lambda x: round(((x.left + x.right) / 2), 4))
sns.violinplot(x='Connection_strength', y='Correlation', data=df)
plt.xlabel('Connection_strength', fontweight='bold')
plt.ylabel('Correlation', fontweight='bold')
plt.tight_layout()

_images/3_ctx_th_connection_and_gene_correlation_23_0.png