Code Lab: Music Virality Analysis

DataField.Dev

Code Lab: Music Virality Analysis

Why do some songs go viral on social media while others -- perhaps equally well crafted -- never escape obscurity? The answer involves both network dynamics and acoustic properties. In this lab, you will build a simple network diffusion model to simulate how a song spreads through a social graph, analyze which audio features correlate with viral success, and visualize the characteristic S-shaped curve of viral adoption. All models use simulated data, but the patterns they reveal reflect documented trends in real streaming and social media research.

Simulating Audio Features for a Track Catalog

We begin by generating a synthetic dataset of 200 tracks with audio features that loosely follow the distributions observed in real streaming data.

import numpy as np
import matplotlib.pyplot as plt

# Single seeded generator so every run of the lab produces the same data.
rng = np.random.default_rng(seed=42)
N_TRACKS = 200

# One row per audio feature: (name, mean, std dev, lower clip, upper clip).
# Rows are sampled top to bottom, so their order fixes the random stream.
_FEATURE_SPECS = (
    ('tempo', 118, 18, 60, 200),
    ('energy', 0.65, 0.15, 0, 1),
    ('danceability', 0.62, 0.14, 0, 1),
    ('valence', 0.50, 0.18, 0, 1),
    ('duration_s', 195, 40, 90, 420),
)
tracks = {
    name: np.clip(rng.normal(mean, spread, N_TRACKS), lower, upper)
    for name, mean, spread, lower, upper in _FEATURE_SPECS
}

# Synthetic "virality score": energetic, danceable, upbeat tracks near
# 115 BPM score higher, long tracks score lower, plus Gaussian noise.
# tempo_factor is a bell curve over tempo, peaking at 115 BPM (width 20 BPM).
tempo_factor = np.exp(-((tracks['tempo'] - 115) ** 2) / (2 * 20 ** 2))
_feature_score = (
    20 * tracks['energy']
    + 25 * tracks['danceability']
    + 8 * tracks['valence']
)
_noise = rng.normal(0, 5, N_TRACKS)
virality_raw = (
    _feature_score
    + 10 * tempo_factor
    - 0.03 * tracks['duration_s']   # each extra second costs 0.03 points
    + _noise
)
# Keep the final score on a 0-100 scale.
tracks['virality'] = np.clip(virality_raw, 0, 100)

Correlation Analysis: Which Features Predict Virality?

features = ['tempo', 'energy', 'danceability', 'valence', 'duration_s']

# Text report: Pearson correlation of each audio feature with virality.
print("Correlation with virality score:")
print("-" * 40)
for feat in features:
    # corrcoef returns a 2x2 matrix; the off-diagonal entry is the
    # correlation between the two inputs.
    r = np.corrcoef(tracks[feat], tracks['virality'])[0, 1]
    print(f"  {feat:<15} r = {r:+.3f}")

# One scatter panel per highlighted feature, each with its own colour.
panel_features = ['energy', 'danceability', 'duration_s']
panel_colors = ['#e74c3c', '#3498db', '#2ecc71']
fig, axes = plt.subplots(1, 3, figsize=(13, 4))
for ax, feat, color in zip(axes, panel_features, panel_colors):
    ax.scatter(
        tracks[feat],
        tracks['virality'],
        alpha=0.5,
        s=15,
        color=color,
        edgecolors='none',
    )

    # Least-squares straight line drawn across the observed feature range.
    z = np.polyfit(tracks[feat], tracks['virality'], 1)
    trend = np.poly1d(z)
    x_line = np.linspace(np.min(tracks[feat]), np.max(tracks[feat]), 100)
    ax.plot(x_line, trend(x_line), 'k--', linewidth=1.5)

    # Panel title shows the same correlation as the text report.
    r = np.corrcoef(tracks[feat], tracks['virality'])[0, 1]
    ax.set_title(f'r = {r:+.3f}', fontsize=10)
    ax.set_xlabel(feat.replace('_', ' ').title())
    ax.set_ylabel('Virality Score')
    ax.grid(True, alpha=0.3)

fig.suptitle('Audio Features vs. Virality Score', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

Danceability and energy show clear positive correlations, while longer track duration is negatively associated with virality -- consistent with the short-attention-span dynamics of social media feeds.

Network Diffusion Model: How Songs Spread

Viral spread can be modeled as diffusion through a social network. We construct a random graph where each node is a user and each edge is a mutual connection between two users (the graph is undirected, so influence can flow in both directions). Once a user "adopts" (shares) a song, each of their connections who has not yet adopted it has a fixed probability of adopting it on every subsequent time step.

def build_random_network(n_users, avg_connections, random_state=None):
    """Build an adjacency list for an undirected Erdos-Renyi G(n, p) graph.

    Every unordered pair of users is linked independently with probability
    ``avg_connections / (n_users - 1)``. Each user has ``n_users - 1``
    possible partners, so this makes the expected number of connections per
    user equal to ``avg_connections``.

    Args:
        n_users: Number of nodes; users are labelled ``0 .. n_users - 1``.
        avg_connections: Target mean degree. Values at or above
            ``n_users - 1`` produce a complete graph; zero or negative
            values produce no edges.
        random_state: Optional ``numpy.random.Generator`` to draw from.
            When omitted, the module-level ``rng`` is used.

    Returns:
        Dict mapping each user to the list of users it is connected to.
        Edges are symmetric: ``j`` appears in ``adjacency[i]`` exactly when
        ``i`` appears in ``adjacency[j]``.
    """
    adjacency = {i: [] for i in range(n_users)}
    if n_users < 2:
        # No pairs to connect; returning early also avoids dividing by zero.
        return adjacency

    generator = rng if random_state is None else random_state
    p_connect = avg_connections / (n_users - 1)
    for i in range(n_users):
        # Only visit j > i so each unordered pair is tested exactly once.
        for j in range(i + 1, n_users):
            if generator.random() < p_connect:
                adjacency[i].append(j)
                adjacency[j].append(i)
    return adjacency

def simulate_spread(adjacency, seed_users, spread_prob, n_steps):
    """
    Run a step-by-step adoption cascade over the network.

    On every step, each user who has already adopted gets one independent
    chance (probability ``spread_prob``, drawn from the module-level ``rng``)
    to convert each neighbour that had not adopted at the start of that step.
    Adoption is permanent.

    Returns a list whose first entry is the number of distinct seed users and
    whose later entries are the cumulative adopter counts after each step.
    The list stops early once every user in ``adjacency`` has adopted, so it
    holds at most ``n_steps + 1`` entries.
    """
    population = len(adjacency)
    infected = set(seed_users)
    counts = [len(infected)]

    for _ in range(n_steps):
        converts = set()
        # `infected` is not modified inside this loop; new adopters are
        # collected separately and merged only after the step finishes.
        for carrier in infected:
            for contact in adjacency[carrier]:
                if contact in infected:
                    continue
                if rng.random() < spread_prob:
                    converts.add(contact)
        infected |= converts
        counts.append(len(infected))
        if len(infected) == population:
            # Network saturated: nothing left to convert.
            break

    return counts

# Size of the simulated social graph and its target mean number of
# connections per user.
N_USERS = 500
AVG_CONNECTIONS = 8
network = build_random_network(
    n_users=N_USERS,
    avg_connections=AVG_CONNECTIONS,
)

# Users 0, 1 and 2 share the track at time zero.
seed = list(range(3))

Simulating the Viral Spread Curve

We run the diffusion model at three different spread probabilities to see how infectiousness shapes the adoption curve. A higher spread probability corresponds to a more compelling (catchy, danceable, high-energy) track.

fig, ax = plt.subplots(figsize=(9, 5))

# One scenario per curve: (spread probability, legend label, line colour).
scenarios = [
    (0.03, 'Low virality (p=0.03)', '#3498db'),
    (0.08, 'Medium virality (p=0.08)', '#f39c12'),
    (0.15, 'High virality (p=0.15)', '#e74c3c'),
]

for prob, label, color in scenarios:
    history = simulate_spread(network, seed, prob, n_steps=40)
    steps = range(len(history))
    ax.plot(steps, history, color=color, linewidth=2, label=label)

    # Step-to-step gains; the largest one marks the fastest growth.
    diffs = np.diff(history)
    if diffs.size == 0:
        continue  # a single-point history has no growth to mark
    inflection = np.argmax(diffs)
    ax.axvline(inflection, color=color, linestyle=':', alpha=0.4)

ax.set(
    xlabel='Time Step (share cycles)',
    ylabel='Cumulative Adopters',
    xlim=(0, 40),
    ylim=(0, N_USERS),
)
ax.set_title(
    'Network Diffusion: Viral Spread Curves\n'
    '(S-shaped logistic growth with different spread probabilities)'
)
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

The characteristic S-shaped curve appears: slow initial growth while the song circulates in a small cluster, then explosive growth as the expanding pool of sharers exposes more and more users at once, and finally saturation as most of the network has already adopted the song. The dotted vertical lines mark the inflection points -- the time steps with the largest single-step gain in adopters, which on real platforms correspond to the song "trending."

Fitting a Logistic Growth Model

The viral S-curve follows logistic dynamics. We can fit the simulated data to the logistic equation to extract the growth rate and carrying capacity.

from scipy.optimize import curve_fit

def logistic(t, L, k, t0):
    """Evaluate the logistic curve L / (1 + exp(-k * (t - t0))) at t.

    L is the carrying capacity (upper plateau), k the growth rate, and t0 the
    midpoint, i.e. the time at which the curve equals L / 2.
    """
    exponent = -k * (t - t0)
    denominator = 1 + np.exp(exponent)
    return L / denominator

# Generate a single cascade at p = 0.12 and index its points by time step.
history = simulate_spread(network, seed, 0.12, n_steps=50)
t_data = np.arange(len(history))

# Least-squares fit, started from: full-network capacity, growth rate 0.5,
# midpoint at step 15.
popt, _ = curve_fit(
    logistic,
    t_data,
    history,
    p0=[N_USERS, 0.5, 15],
    maxfev=5000,
)
L_fit, k_fit, t0_fit = popt

print("Fitted logistic parameters:")
print(f"  Carrying capacity (L): {L_fit:.0f} users")
print(f"  Growth rate (k):       {k_fit:.3f} per time step")
print(f"  Midpoint (t0):         {t0_fit:.1f} time steps")

# Dense time grid so the fitted curve is drawn smoothly between data points.
t_smooth = np.linspace(0, len(history) - 1, 200)

plt.figure(figsize=(9, 5))
plt.scatter(
    t_data, history,
    color='steelblue', s=20, zorder=3,
    label='Simulated data',
)
plt.plot(
    t_smooth, logistic(t_smooth, *popt),
    'r-', linewidth=2,
    label=f'Logistic fit (k={k_fit:.2f}, L={L_fit:.0f})',
)
# Horizontal guide at half the fitted capacity, which the curve crosses at t0.
plt.axhline(
    L_fit / 2,
    color='gray', linestyle=':', alpha=0.5,
    label=f'50% adoption (t = {t0_fit:.1f})',
)
plt.title('Logistic Fit to Viral Spread Simulation')
plt.xlabel('Time Step')
plt.ylabel('Cumulative Adopters')
plt.legend(fontsize=9)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

The logistic fit captures the diffusion dynamics well. The growth rate parameter k maps directly to how "catchy" or shareable a track is -- and our earlier analysis showed that this catchiness correlates with measurable acoustic features.

Try It Yourself

Network topology matters. Replace the random (Erdos-Renyi) network with a scale-free network where a few "influencer" nodes have many more connections than average. You can do this by assigning connection counts from a power-law distribution. How does the presence of influencers change the shape and speed of the viral curve? Does the inflection point shift earlier or later?
Feature threshold experiment. Using the track catalog, divide tracks into "viral" (top 25% by virality score) and "non-viral" (bottom 25%). For each audio feature, compute the mean and standard deviation in both groups. Which feature shows the largest separation between groups? Create a bar chart comparing the two groups across all features.
Decay and re-sharing. Real virality is not permanent -- interest fades. Modify the diffusion model so that adopters "forget" the song after 10 time steps and can be re-infected. How does this change the long-term dynamics? Does the system reach a steady state, oscillate, or die out? Plot the number of active sharers (not cumulative) over time.