diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 113bfc468a4d7d2e112ee4222bbcbe5a1ae5f8e3..b118744d33823224716a295e6a37990541480b82 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -164,6 +164,9 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */
 #define IEEE80211_MAX_FRAME_LEN		2352
 
+/* Maximal size of an A-MSDU that can be transported in a HT BA session */
+#define IEEE80211_MAX_MPDU_LEN_HT_BA		4095
+
 /* Maximal size of an A-MSDU */
 #define IEEE80211_MAX_MPDU_LEN_HT_3839		3839
 #define IEEE80211_MAX_MPDU_LEN_HT_7935		7935
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5f4b4c773a920c84d898babae0c79dd1b850cce6..a3ee7655979161c65d04eb80cb441f3de83de1b9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -713,6 +713,7 @@ enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
+ * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -720,6 +721,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PORT_CTRL_PROTO	= BIT(0),
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
+	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
 };
 
 /*
@@ -1746,6 +1748,7 @@ struct ieee80211_sta_rates {
  *	Both additional HT limits must be enforced by the low level driver.
  *	This is defined by the spec (IEEE 802.11-2012 section 8.3.2.2 NOTE 2).
  * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not.
+ * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control.
  * @txq: per-TID data TX queues (if driver uses the TXQ abstraction)
  */
 struct ieee80211_sta {
@@ -1767,6 +1770,7 @@ struct ieee80211_sta {
 	u8 max_amsdu_subframes;
 	u16 max_amsdu_len;
 	bool support_p2p_ps;
+	u16 max_rc_amsdu_len;
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS];
 
@@ -1983,6 +1987,15 @@ struct ieee80211_txq {
  * @IEEE80211_HW_USES_RSS: The device uses RSS and thus requires parallel RX,
  *	which implies using per-CPU station statistics.
  *
+ * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated
+ *	A-MSDU frames. Requires software tx queueing and fast-xmit support.
+ *	When not using minstrel/minstrel_ht rate control, the driver must
+ *	limit the maximum A-MSDU size based on the current tx rate by setting
+ *	max_rc_amsdu_len in struct ieee80211_sta.
+ *
+ * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list
+ *	skbs, needed for zero-copy software A-MSDU.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2021,6 +2034,8 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
 	IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
 	IEEE80211_HW_USES_RSS,
+	IEEE80211_HW_TX_AMSDU,
+	IEEE80211_HW_TX_FRAG_LIST,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
@@ -2093,6 +2108,9 @@ enum ieee80211_hw_flags {
  *	size is smaller (an example is LinkSys WRT120N with FW v1.0.07
  *	build 002 Jun 18 2012).
  *
+ * @max_tx_fragments: maximum number of tx buffers per (A)-MSDU, sum
+ *	of 1 + skb_shinfo(skb)->nr_frags for each skb in the frag_list.
+ *
  * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX
  *	(if %IEEE80211_HW_QUEUE_CONTROL is set)
  *
@@ -2147,6 +2165,7 @@ struct ieee80211_hw {
 	u8 max_rate_tries;
 	u8 max_rx_aggregation_subframes;
 	u8 max_tx_aggregation_subframes;
+	u8 max_tx_fragments;
 	u8 offchannel_tx_hw_queue;
 	u8 radiotap_mcs_details;
 	u16 radiotap_vht_details;
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 4932e9f243a2cb9e156e7e24ac1c098fc9a4b19c..42fa81031dfa94b10352864778542bbb471998d5 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -935,6 +935,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
 				  size_t len)
 {
 	struct tid_ampdu_tx *tid_tx;
+	struct ieee80211_txq *txq;
 	u16 capab, tid;
 	u8 buf_size;
 	bool amsdu;
@@ -945,6 +946,10 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
 	buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
 	buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
 
+	txq = sta->sta.txq[tid];
+	if (!amsdu && txq)
+		set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags);
+
 	mutex_lock(&sta->ampdu_mlme.mtx);
 
 	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 52ed2afc408dc583378572035968d187d0f413fe..b251b2f7f8dd8c6859ec5d32530e741530835f40 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -128,6 +128,8 @@ static const char *hw_flag_names[] = {
 	FLAG(NEEDS_UNIQUE_STA_ADDR),
 	FLAG(SUPPORTS_REORDERING_BUFFER),
 	FLAG(USES_RSS),
+	FLAG(TX_AMSDU),
+	FLAG(TX_FRAG_LIST),
 #undef FLAG
 };
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6243109979ed5a8e28eee9c49bef1c1630a7723f..40c1d343992cfa0c0afc02404acf8fbdf5297aa9 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -802,6 +802,7 @@ struct mac80211_qos_map {
 enum txq_info_flags {
 	IEEE80211_TXQ_STOP,
 	IEEE80211_TXQ_AMPDU,
+	IEEE80211_TXQ_NO_AMSDU,
 };
 
 struct txq_info {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index cf2aca0cc200e635ae82b901d9c921ab97b59be1..960e13d8ed304570efd7bdeb2299be620498bff3 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -416,6 +416,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
+	sta->sta.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA;
+
 	sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);
 
 	return sta;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 597c8fe672a35e69355ee8bc0c3514f9514f2dce..4fa2842ddb25599eddc36ba23e3d0b6ed20b1c41 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1324,6 +1324,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 out:
 	spin_unlock_bh(&txqi->queue.lock);
 
+	if (skb && skb_has_frag_list(skb) &&
+	    !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
+		skb_linearize(skb);
+
 	return skb;
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
@@ -2802,6 +2806,154 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
 		kfree_rcu(fast_tx, rcu_head);
 }
 
+static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local,
+					struct sk_buff *skb, int headroom,
+					int *subframe_len)
+{
+	int amsdu_len = *subframe_len + sizeof(struct ethhdr);
+	int padding = (4 - amsdu_len) & 3;
+
+	if (skb_headroom(skb) < headroom || skb_tailroom(skb) < padding) {
+		I802_DEBUG_INC(local->tx_expand_skb_head);
+
+		if (pskb_expand_head(skb, headroom, padding, GFP_ATOMIC)) {
+			wiphy_debug(local->hw.wiphy,
+				    "failed to reallocate TX buffer\n");
+			return false;
+		}
+	}
+
+	if (padding) {
+		*subframe_len += padding;
+		memset(skb_put(skb, padding), 0, padding);
+	}
+
+	return true;
+}
+
+static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
+					 struct ieee80211_fast_tx *fast_tx,
+					 struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr;
+	struct ethhdr amsdu_hdr;
+	int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
+	int subframe_len = skb->len - hdr_len;
+	void *data;
+	u8 *qc;
+
+	if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
+		return false;
+
+	if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
+		return true;
+
+	if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(amsdu_hdr),
+					 &subframe_len))
+		return false;
+
+	amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
+	memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
+	memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
+
+	data = skb_push(skb, sizeof(amsdu_hdr));
+	memmove(data, data + sizeof(amsdu_hdr), hdr_len);
+	memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
+
+	hdr = data;
+	qc = ieee80211_get_qos_ctl(hdr);
+	*qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
+
+	info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
+
+	return true;
+}
+
+static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
+				      struct sta_info *sta,
+				      struct ieee80211_fast_tx *fast_tx,
+				      struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+	struct ieee80211_txq *txq = sta->sta.txq[tid];
+	struct txq_info *txqi;
+	struct sk_buff **frag_tail, *head;
+	int subframe_len = skb->len - ETH_ALEN;
+	u8 max_subframes = sta->sta.max_amsdu_subframes;
+	int max_frags = local->hw.max_tx_fragments;
+	int max_amsdu_len = sta->sta.max_amsdu_len;
+	__be16 len;
+	void *data;
+	bool ret = false;
+	int n = 1, nfrags;
+
+	if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
+		return false;
+
+	if (!txq)
+		return false;
+
+	txqi = to_txq_info(txq);
+	if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
+		return false;
+
+	if (sta->sta.max_rc_amsdu_len)
+		max_amsdu_len = min_t(int, max_amsdu_len,
+				      sta->sta.max_rc_amsdu_len);
+
+	spin_lock_bh(&txqi->queue.lock);
+
+	head = skb_peek_tail(&txqi->queue);
+	if (!head)
+		goto out;
+
+	if (skb->len + head->len > max_amsdu_len)
+		goto out;
+
+	if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
+		goto out;
+
+	nfrags = 1 + skb_shinfo(skb)->nr_frags;
+	nfrags += 1 + skb_shinfo(head)->nr_frags;
+	frag_tail = &skb_shinfo(head)->frag_list;
+	while (*frag_tail) {
+		nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
+		frag_tail = &(*frag_tail)->next;
+		n++;
+	}
+
+	if (max_subframes && n > max_subframes)
+		goto out;
+
+	if (max_frags && nfrags > max_frags)
+		goto out;
+
+	if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + 2,
+					 &subframe_len))
+		goto out;
+
+	ret = true;
+	data = skb_push(skb, ETH_ALEN + 2);
+	memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
+
+	data += 2 * ETH_ALEN;
+	len = cpu_to_be16(subframe_len);
+	memcpy(data, &len, 2);
+	memcpy(data + 2, rfc1042_header, sizeof(rfc1042_header));
+
+	head->len += skb->len;
+	head->data_len += skb->len;
+	*frag_tail = skb;
+
+out:
+	spin_unlock_bh(&txqi->queue.lock);
+
+	return ret;
+}
+
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 				struct net_device *dev, struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
@@ -2856,6 +3008,10 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_tx_stats(dev, skb->len + extra_head);
 
+	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
+	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
+		return true;
+
 	/* will not be crypto-handled beyond what we do here, so use false
 	 * as the may-encrypt argument for the resize to not account for
 	 * more room than we already have in 'extra_head'