author	Michael Marineau <marineam@gentoo.org>	2008-02-23 23:34:18 +0000
committer	Michael Marineau <marineam@gentoo.org>	2008-02-23 23:34:18 +0000
commit	61d3b98bad64bdb6e8f61dec73ee540beb49539a (patch)
tree	f835b8168710e60d582fbc366fd6c67dfd785ea1
parent	Replace the 2.6.22 patches with Suse's xen patchset which actually works. :-) (diff)
Releasing 2.6.22-2
svn path=/patches/; revision=76
-rw-r--r--	tags/2.6.22-2/00000_README	36
-rw-r--r--	tags/2.6.22-2/01012_linux-2.6.22.13.patch	42
-rw-r--r--	tags/2.6.22-2/01013_linux-2.6.22.14.patch	1319
-rw-r--r--	tags/2.6.22-2/01014_linux-2.6.22.15.patch	1096
-rw-r--r--	tags/2.6.22-2/01015_linux-2.6.22.16.patch	27
-rw-r--r--	tags/2.6.22-2/01016_linux-2.6.22.17.patch	1360
-rw-r--r--	tags/2.6.22-2/01017_linux-2.6.22.18.patch	14
-rw-r--r--	tags/2.6.22-2/20001_x86-early-quirks-unificiation.patch1	237
-rw-r--r--	tags/2.6.22-2/20002_add-console-use-vt.patch1	58
-rw-r--r--	tags/2.6.22-2/20003_linux-2.6.19-rc1-kexec-move_segment_code-i386.patch1	172
-rw-r--r--	tags/2.6.22-2/20004_linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch1	164
-rw-r--r--	tags/2.6.22-2/20005_blktap-aio-16_03_06.patch1	209
-rw-r--r--	tags/2.6.22-2/20006_fix-ide-cd-pio-mode.patch1	36
-rw-r--r--	tags/2.6.22-2/20007_i386-mach-io-check-nmi.patch1	53
-rw-r--r--	tags/2.6.22-2/20008_net-csum.patch1	50
-rw-r--r--	tags/2.6.22-2/20009_xenoprof-generic.patch1	669
-rw-r--r--	tags/2.6.22-2/20010_softlockup-no-idle-hz.patch1	75
-rw-r--r--	tags/2.6.22-2/20011_xen3-auto-xen-arch.patch1	49825
-rw-r--r--	tags/2.6.22-2/20012_xen3-auto-xen-drivers.patch1	28404
-rw-r--r--	tags/2.6.22-2/20013_xen3-auto-include-xen-interface.patch1	8771
-rw-r--r--	tags/2.6.22-2/20014_xen3-auto-xen-kconfig.patch1	887
-rw-r--r--	tags/2.6.22-2/20015_xen3-auto-common.patch1	2101
-rw-r--r--	tags/2.6.22-2/20016_xen3-auto-arch-i386.patch1	483
-rw-r--r--	tags/2.6.22-2/20017_xen3-auto-arch-x86_64.patch1	502
-rw-r--r--	tags/2.6.22-2/20018_15130-x86_64-vsyscall-user.patch1	51
-rw-r--r--	tags/2.6.22-2/20019_15181-dma-tracking.patch1	551
-rw-r--r--	tags/2.6.22-2/20020_30-bit-field-booleans.patch1	38
-rw-r--r--	tags/2.6.22-2/20021_42-freeze.patch1	67
-rw-r--r--	tags/2.6.22-2/20022_67-edd.patch1	209
-rw-r--r--	tags/2.6.22-2/20023_70-edid.patch1	118
-rw-r--r--	tags/2.6.22-2/20024_79-balloon-highmem.patch1	42
-rw-r--r--	tags/2.6.22-2/20025_80-blk-teardown.patch1	57
-rw-r--r--	tags/2.6.22-2/20026_81-clock-was-set.patch1	48
-rw-r--r--	tags/2.6.22-2/20027_82-blkdev-wait.patch1	92
-rw-r--r--	tags/2.6.22-2/20028_93-swiotlb.patch1	146
-rw-r--r--	tags/2.6.22-2/20029_95-privcmd-wrlock.patch1	72
-rw-r--r--	tags/2.6.22-2/20030_136-pae-vmalloc-sync-all.patch1	63
-rw-r--r--	tags/2.6.22-2/20031_137-netfront-copy-release.patch1	128
-rw-r--r--	tags/2.6.22-2/20032_141-driver-autoload.patch1	120
-rw-r--r--	tags/2.6.22-2/20033_144-xenbus-dev-wait.patch1	104
-rw-r--r--	tags/2.6.22-2/20034_145-xenbus-error-path.patch1	24
-rw-r--r--	tags/2.6.22-2/20035_148-blkfront-no-bounce-bufs.patch1	25
-rw-r--r--	tags/2.6.22-2/20036_152-netloop-check-cloned-skb.patch1	35
-rw-r--r--	tags/2.6.22-2/20037_157-netfront-skb-deref.patch1	35
-rw-r--r--	tags/2.6.22-2/20038_252-l1-entry-update-highpte.patch1	27
-rw-r--r--	tags/2.6.22-2/20039_265-ptep_get_and_clear.patch1	74
-rw-r--r--	tags/2.6.22-2/20040_xen3-fixup-common.patch1	365
-rw-r--r--	tags/2.6.22-2/20041_xen3-fixup-arch-i386.patch1	76
-rw-r--r--	tags/2.6.22-2/20042_xen3-fixup-arch-x86_64.patch1	103
-rw-r--r--	tags/2.6.22-2/20043_xen3-patch-2.6.18.patch1	394
-rw-r--r--	tags/2.6.22-2/20044_xen3-patch-2.6.19.patch1	12637
-rw-r--r--	tags/2.6.22-2/20045_xen3-patch-2.6.20.patch1	7592
-rw-r--r--	tags/2.6.22-2/20046_xen3-patch-2.6.21.patch1	5107
-rw-r--r--	tags/2.6.22-2/20047_xen3-patch-2.6.22.patch1	7866
-rw-r--r--	tags/2.6.22-2/20048_xen3-patch-2.6.22.5-6.patch1	30
-rw-r--r--	tags/2.6.22-2/20049_xen3-patch-2.6.22.6-7.patch1	57
-rw-r--r--	tags/2.6.22-2/20050_xen3-patch-2.6.22.11-12.patch1	88
-rw-r--r--	tags/2.6.22-2/20051_xen3-x86-early-quirks-unificiation.patch1	25
-rw-r--r--	tags/2.6.22-2/20052_xen3-x86-fam10-l3cache.patch1	28
-rw-r--r--	tags/2.6.22-2/20053_xen3-aux-at_vector_size.patch1	47
-rw-r--r--	tags/2.6.22-2/20054_xen-balloon-min.patch1	77
-rw-r--r--	tags/2.6.22-2/20055_xen-modular-blktap.patch1	41
-rw-r--r--	tags/2.6.22-2/20056_xen-x86-panic-no-reboot.patch1	58
-rw-r--r--	tags/2.6.22-2/20057_xen-i386-panic-on-oops.patch1	27
-rw-r--r--	tags/2.6.22-2/20058_xen-x86-kconfig-no-cpu_freq.patch1	35
-rw-r--r--	tags/2.6.22-2/20059_xen-configurable-console.patch1	181
-rw-r--r--	tags/2.6.22-2/20060_xen-x86_64-init-cleanup.patch1	294
-rw-r--r--	tags/2.6.22-2/20061_xen-balloon-max-target.patch1	32
-rw-r--r--	tags/2.6.22-2/20062_xen-x86-dcr-fallback.patch1	158
-rw-r--r--	tags/2.6.22-2/20063_xen-x86-consistent-nmi.patch1	345
-rw-r--r--	tags/2.6.22-2/20064_xen-x86-no-lapic.patch1	1426
-rw-r--r--	tags/2.6.22-2/20065_xen-no-video-select.patch1	21
-rw-r--r--	tags/2.6.22-2/20066_xen-blkback-bimodal-suse.patch1	39
-rw-r--r--	tags/2.6.22-2/20067_xen-console-default.patch1	41
-rw-r--r--	tags/2.6.22-2/20068_xen-x86-panic-smp.patch1	96
-rw-r--r--	tags/2.6.22-2/20069_xen-split-pt-lock.patch1	220
-rw-r--r--	tags/2.6.22-2/20070_xen-blkif-protocol-fallback-hack.patch1	229
-rw-r--r--	tags/2.6.22-2/20071_xen-x86-pXX_val.patch1	434
-rw-r--r--	tags/2.6.22-2/20072_xen-x86_64-physmap-nx.patch1	36
-rw-r--r--	tags/2.6.22-2/20073_xen-i386-kconfig-msr.patch1	18
-rw-r--r--	tags/2.6.22-2/20074_xen-x86_64-entry.patch1	42
-rw-r--r--	tags/2.6.22-2/20075_xen-intel-agp.patch1	33
-rw-r--r--	tags/2.6.22-2/20076_xen-blkback-cdrom.patch1	277
-rw-r--r--	tags/2.6.22-2/20077_xen-isa-dma.patch1	543
-rw-r--r--	tags/2.6.22-2/20078_xen-i386-set-fixmap.patch1	126
-rw-r--r--	tags/2.6.22-2/20079_xenfb-module-param.patch1	108
-rw-r--r--	tags/2.6.22-2/50001_make-install.patch	52
-rw-r--r--	tags/2.6.22-2/50002_always-enable-xen-genapic.patch	12
-rw-r--r--	tags/2.6.22-2/50009_gentooify-tls-warning.patch	16
89 files changed, 138148 insertions, 0 deletions
diff --git a/tags/2.6.22-2/00000_README b/tags/2.6.22-2/00000_README
new file mode 100644
index 0000000..82c139d
--- /dev/null
+++ b/tags/2.6.22-2/00000_README
@@ -0,0 +1,36 @@
+Xen Patches README
+------------------
+
+These patches are intended to be stacked on top of genpatches-base.
+
+Many of the patches included here are swiped from various sources which
+use their own four digit patch numbering scheme, so we are stuck with five
+digits to indicate the source for easier tracking and re-syncing.
+
+Numbering
+---------
+
+0xxxx Gentoo, not related to Xen. (in case we pull something from extras)
+2xxxx Suse, we are using their Xen patch for 2.6.22
+5xxxx Gentoo, Xen and other fixes for Redhat and/or Debian patches.
+
+Patches
+-------
+
+0xxxx_linux-2.6.22.???
+ Kernel.org 2.6.22.y releases that are not included in genpatches.
+
+2xxxx_???
+ Xen patches from Suse's kernel. Note that they are named *.patch1
+ to make sure unipatch does the correct thing.
+
+50001_make-install.patch
+ Handle make install in a semi-sane way that plays nice with
+ split domU/dom0 kernels.
+
+50002_always-enable-xen-genapic.patch
+ Compile fix for non-SMP (UP) kernels. Since UP support is broken in
+ upstream Xen I'm not sure if I trust it or not. :-P
+
+50009_gentooify-tls-warning.patch
+ Change tls warning instructions to apply directly to Gentoo.
diff --git a/tags/2.6.22-2/01012_linux-2.6.22.13.patch b/tags/2.6.22-2/01012_linux-2.6.22.13.patch
new file mode 100644
index 0000000..cfd8333
--- /dev/null
+++ b/tags/2.6.22-2/01012_linux-2.6.22.13.patch
@@ -0,0 +1,42 @@
+Subject: Linux 2.6.22.13
+From: Greg Kroah-Hartman <gregkh@suse.de>
+
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+diff --git a/kernel/exit.c b/kernel/exit.c
+index 5c8ecba..e3adc46 100644
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -1336,8 +1336,7 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
+ int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
+
+ exit_code = p->exit_code;
+- if (unlikely(!exit_code) ||
+- unlikely(p->state & TASK_TRACED))
++ if (unlikely(!exit_code) || unlikely(p->exit_state))
+ goto bail_ref;
+ return wait_noreap_copyout(p, pid, uid,
+ why, (exit_code << 8) | 0x7f,
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index e33fb3d..2e1d8e7 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -994,6 +994,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
+ if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
+ return 0;
+
++ if (!tp->packets_out)
++ goto out;
++
+ /* SACK fastpath:
+ * if the only SACK change is the increase of the end_seq of
+ * the first block then only apply that SACK block
+@@ -1262,6 +1265,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
+ (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
+ tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
+
++out:
++
+ #if FASTRETRANS_DEBUG > 0
+ BUG_TRAP((int)tp->sacked_out >= 0);
+ BUG_TRAP((int)tp->lost_out >= 0);
diff --git a/tags/2.6.22-2/01013_linux-2.6.22.14.patch b/tags/2.6.22-2/01013_linux-2.6.22.14.patch
new file mode 100644
index 0000000..aea3379
--- /dev/null
+++ b/tags/2.6.22-2/01013_linux-2.6.22.14.patch
@@ -0,0 +1,1319 @@
+Subject: Linux 2.6.22.14
+From: Greg Kroah-Hartman <gregkh@suse.de>
+
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
+index f64b81f..8e02ed6 100644
+--- a/arch/i386/kernel/tsc.c
++++ b/arch/i386/kernel/tsc.c
+@@ -122,7 +122,7 @@ unsigned long native_calculate_cpu_khz(void)
+ {
+ unsigned long long start, end;
+ unsigned long count;
+- u64 delta64;
++ u64 delta64 = (u64)ULLONG_MAX;
+ int i;
+ unsigned long flags;
+
+@@ -134,6 +134,7 @@ unsigned long native_calculate_cpu_khz(void)
+ rdtscll(start);
+ mach_countup(&count);
+ rdtscll(end);
++ delta64 = min(delta64, (end - start));
+ }
+ /*
+ * Error: ECTCNEVERSET
+@@ -144,8 +145,6 @@ unsigned long native_calculate_cpu_khz(void)
+ if (count <= 1)
+ goto err;
+
+- delta64 = end - start;
+-
+ /* cpu freq too fast: */
+ if (delta64 > (1ULL<<32))
+ goto err;
+diff --git a/drivers/i2c/busses/i2c-pasemi.c b/drivers/i2c/busses/i2c-pasemi.c
+index 58e3271..dcf5dec 100644
+--- a/drivers/i2c/busses/i2c-pasemi.c
++++ b/drivers/i2c/busses/i2c-pasemi.c
+@@ -51,6 +51,7 @@ struct pasemi_smbus {
+ #define MRXFIFO_DATA_M 0x000000ff
+
+ #define SMSTA_XEN 0x08000000
++#define SMSTA_MTN 0x00200000
+
+ #define CTL_MRR 0x00000400
+ #define CTL_MTR 0x00000200
+@@ -98,6 +99,10 @@ static unsigned int pasemi_smb_waitready(struct pasemi_smbus *smbus)
+ status = reg_read(smbus, REG_SMSTA);
+ }
+
++ /* Got NACK? */
++ if (status & SMSTA_MTN)
++ return -ENXIO;
++
+ if (timeout < 0) {
+ dev_warn(&smbus->dev->dev, "Timeout, status 0x%08x\n", status);
+ reg_write(smbus, REG_SMSTA, status);
+diff --git a/drivers/i2c/chips/eeprom.c b/drivers/i2c/chips/eeprom.c
+index bfce13c..5ad36ab 100644
+--- a/drivers/i2c/chips/eeprom.c
++++ b/drivers/i2c/chips/eeprom.c
+@@ -125,13 +125,20 @@ static ssize_t eeprom_read(struct kobject *kobj, char *buf, loff_t off, size_t c
+ for (slice = off >> 5; slice <= (off + count - 1) >> 5; slice++)
+ eeprom_update_client(client, slice);
+
+- /* Hide Vaio security settings to regular users (16 first bytes) */
+- if (data->nature == VAIO && off < 16 && !capable(CAP_SYS_ADMIN)) {
+- size_t in_row1 = 16 - off;
+- in_row1 = min(in_row1, count);
+- memset(buf, 0, in_row1);
+- if (count - in_row1 > 0)
+- memcpy(buf + in_row1, &data->data[16], count - in_row1);
++ /* Hide Vaio private settings to regular users:
++ - BIOS passwords: bytes 0x00 to 0x0f
++ - UUID: bytes 0x10 to 0x1f
++ - Serial number: 0xc0 to 0xdf */
++ if (data->nature == VAIO && !capable(CAP_SYS_ADMIN)) {
++ int i;
++
++ for (i = 0; i < count; i++) {
++ if ((off + i <= 0x1f) ||
++ (off + i >= 0xc0 && off + i <= 0xdf))
++ buf[i] = 0;
++ else
++ buf[i] = data->data[off + i];
++ }
+ } else {
+ memcpy(buf, &data->data[off], count);
+ }
+@@ -195,14 +202,18 @@ static int eeprom_detect(struct i2c_adapter *adapter, int address, int kind)
+ goto exit_kfree;
+
+ /* Detect the Vaio nature of EEPROMs.
+- We use the "PCG-" prefix as the signature. */
++ We use the "PCG-" or "VGN-" prefix as the signature. */
+ if (address == 0x57) {
+- if (i2c_smbus_read_byte_data(new_client, 0x80) == 'P'
+- && i2c_smbus_read_byte(new_client) == 'C'
+- && i2c_smbus_read_byte(new_client) == 'G'
+- && i2c_smbus_read_byte(new_client) == '-') {
++ char name[4];
++
++ name[0] = i2c_smbus_read_byte_data(new_client, 0x80);
++ name[1] = i2c_smbus_read_byte(new_client);
++ name[2] = i2c_smbus_read_byte(new_client);
++ name[3] = i2c_smbus_read_byte(new_client);
++
++ if (!memcmp(name, "PCG-", 4) || !memcmp(name, "VGN-", 4)) {
+ dev_info(&new_client->dev, "Vaio EEPROM detected, "
+- "enabling password protection\n");
++ "enabling privacy protection\n");
+ data->nature = VAIO;
+ }
+ }
+diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c
+index d9c4fd1..096a081 100644
+--- a/drivers/ide/pci/serverworks.c
++++ b/drivers/ide/pci/serverworks.c
+@@ -101,6 +101,7 @@ static u8 svwks_udma_filter(ide_drive_t *drive)
+ mode = 2;
+
+ switch(mode) {
++ case 3: mask = 0x3f; break;
+ case 2: mask = 0x1f; break;
+ case 1: mask = 0x07; break;
+ default: mask = 0x00; break;
+diff --git a/drivers/isdn/hardware/avm/b1.c b/drivers/isdn/hardware/avm/b1.c
+index 7a69a18..4484a64 100644
+--- a/drivers/isdn/hardware/avm/b1.c
++++ b/drivers/isdn/hardware/avm/b1.c
+@@ -321,12 +321,15 @@ void b1_reset_ctr(struct capi_ctr *ctrl)
+ avmctrl_info *cinfo = (avmctrl_info *)(ctrl->driverdata);
+ avmcard *card = cinfo->card;
+ unsigned int port = card->port;
++ unsigned long flags;
+
+ b1_reset(port);
+ b1_reset(port);
+
+ memset(cinfo->version, 0, sizeof(cinfo->version));
++ spin_lock_irqsave(&card->lock, flags);
+ capilib_release(&cinfo->ncci_head);
++ spin_unlock_irqrestore(&card->lock, flags);
+ capi_ctr_reseted(ctrl);
+ }
+
+@@ -361,9 +364,8 @@ void b1_release_appl(struct capi_ctr *ctrl, u16 appl)
+ unsigned int port = card->port;
+ unsigned long flags;
+
+- capilib_release_appl(&cinfo->ncci_head, appl);
+-
+ spin_lock_irqsave(&card->lock, flags);
++ capilib_release_appl(&cinfo->ncci_head, appl);
+ b1_put_byte(port, SEND_RELEASE);
+ b1_put_word(port, appl);
+ spin_unlock_irqrestore(&card->lock, flags);
+@@ -380,27 +382,27 @@ u16 b1_send_message(struct capi_ctr *ctrl, struct sk_buff *skb)
+ u8 subcmd = CAPIMSG_SUBCOMMAND(skb->data);
+ u16 dlen, retval;
+
++ spin_lock_irqsave(&card->lock, flags);
+ if (CAPICMD(cmd, subcmd) == CAPI_DATA_B3_REQ) {
+ retval = capilib_data_b3_req(&cinfo->ncci_head,
+ CAPIMSG_APPID(skb->data),
+ CAPIMSG_NCCI(skb->data),
+ CAPIMSG_MSGID(skb->data));
+- if (retval != CAPI_NOERROR)
++ if (retval != CAPI_NOERROR) {
++ spin_unlock_irqrestore(&card->lock, flags);
+ return retval;
++ }
+
+ dlen = CAPIMSG_DATALEN(skb->data);
+
+- spin_lock_irqsave(&card->lock, flags);
+ b1_put_byte(port, SEND_DATA_B3_REQ);
+ b1_put_slice(port, skb->data, len);
+ b1_put_slice(port, skb->data + len, dlen);
+- spin_unlock_irqrestore(&card->lock, flags);
+ } else {
+- spin_lock_irqsave(&card->lock, flags);
+ b1_put_byte(port, SEND_MESSAGE);
+ b1_put_slice(port, skb->data, len);
+- spin_unlock_irqrestore(&card->lock, flags);
+ }
++ spin_unlock_irqrestore(&card->lock, flags);
+
+ dev_kfree_skb_any(skb);
+ return CAPI_NOERROR;
+@@ -534,17 +536,17 @@ irqreturn_t b1_interrupt(int interrupt, void *devptr)
+
+ ApplId = (unsigned) b1_get_word(card->port);
+ MsgLen = b1_get_slice(card->port, card->msgbuf);
+- spin_unlock_irqrestore(&card->lock, flags);
+ if (!(skb = alloc_skb(MsgLen, GFP_ATOMIC))) {
+ printk(KERN_ERR "%s: incoming packet dropped\n",
+ card->name);
++ spin_unlock_irqrestore(&card->lock, flags);
+ } else {
+ memcpy(skb_put(skb, MsgLen), card->msgbuf, MsgLen);
+ if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_CONF)
+ capilib_data_b3_conf(&cinfo->ncci_head, ApplId,
+ CAPIMSG_NCCI(skb->data),
+ CAPIMSG_MSGID(skb->data));
+-
++ spin_unlock_irqrestore(&card->lock, flags);
+ capi_ctr_handle_message(ctrl, ApplId, skb);
+ }
+ break;
+@@ -554,21 +556,17 @@ irqreturn_t b1_interrupt(int interrupt, void *devptr)
+ ApplId = b1_get_word(card->port);
+ NCCI = b1_get_word(card->port);
+ WindowSize = b1_get_word(card->port);
+- spin_unlock_irqrestore(&card->lock, flags);
+-
+ capilib_new_ncci(&cinfo->ncci_head, ApplId, NCCI, WindowSize);
+-
++ spin_unlock_irqrestore(&card->lock, flags);
+ break;
+
+ case RECEIVE_FREE_NCCI:
+
+ ApplId = b1_get_word(card->port);
+ NCCI = b1_get_word(card->port);
+- spin_unlock_irqrestore(&card->lock, flags);
+-
+ if (NCCI != 0xffffffff)
+ capilib_free_ncci(&cinfo->ncci_head, ApplId, NCCI);
+-
++ spin_unlock_irqrestore(&card->lock, flags);
+ break;
+
+ case RECEIVE_START:
+diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c
+index d58f927..8710cf6 100644
+--- a/drivers/isdn/hardware/avm/c4.c
++++ b/drivers/isdn/hardware/avm/c4.c
+@@ -727,6 +727,7 @@ static void c4_send_init(avmcard *card)
+ {
+ struct sk_buff *skb;
+ void *p;
++ unsigned long flags;
+
+ skb = alloc_skb(15, GFP_ATOMIC);
+ if (!skb) {
+@@ -744,12 +745,15 @@ static void c4_send_init(avmcard *card)
+ skb_put(skb, (u8 *)p - (u8 *)skb->data);
+
+ skb_queue_tail(&card->dma->send_queue, skb);
++ spin_lock_irqsave(&card->lock, flags);
+ c4_dispatch_tx(card);
++ spin_unlock_irqrestore(&card->lock, flags);
+ }
+
+ static int queue_sendconfigword(avmcard *card, u32 val)
+ {
+ struct sk_buff *skb;
++ unsigned long flags;
+ void *p;
+
+ skb = alloc_skb(3+4, GFP_ATOMIC);
+@@ -766,7 +770,9 @@ static int queue_sendconfigword(avmcard *card, u32 val)
+ skb_put(skb, (u8 *)p - (u8 *)skb->data);
+
+ skb_queue_tail(&card->dma->send_queue, skb);
++ spin_lock_irqsave(&card->lock, flags);
+ c4_dispatch_tx(card);
++ spin_unlock_irqrestore(&card->lock, flags);
+ return 0;
+ }
+
+@@ -986,7 +992,9 @@ static void c4_release_appl(struct capi_ctr *ctrl, u16 appl)
+ struct sk_buff *skb;
+ void *p;
+
++ spin_lock_irqsave(&card->lock, flags);
+ capilib_release_appl(&cinfo->ncci_head, appl);
++ spin_unlock_irqrestore(&card->lock, flags);
+
+ if (ctrl->cnr == card->cardnr) {
+ skb = alloc_skb(7, GFP_ATOMIC);
+@@ -1019,7 +1027,8 @@ static u16 c4_send_message(struct capi_ctr *ctrl, struct sk_buff *skb)
+ u16 retval = CAPI_NOERROR;
+ unsigned long flags;
+
+- if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_REQ) {
++ spin_lock_irqsave(&card->lock, flags);
++ if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_REQ) {
+ retval = capilib_data_b3_req(&cinfo->ncci_head,
+ CAPIMSG_APPID(skb->data),
+ CAPIMSG_NCCI(skb->data),
+@@ -1027,10 +1036,9 @@ static u16 c4_send_message(struct capi_ctr *ctrl, struct sk_buff *skb)
+ }
+ if (retval == CAPI_NOERROR) {
+ skb_queue_tail(&card->dma->send_queue, skb);
+- spin_lock_irqsave(&card->lock, flags);
+ c4_dispatch_tx(card);
+- spin_unlock_irqrestore(&card->lock, flags);
+ }
++ spin_unlock_irqrestore(&card->lock, flags);
+ return retval;
+ }
+
+diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
+index 765fb75..06f6ec3 100644
+--- a/drivers/net/forcedeth.c
++++ b/drivers/net/forcedeth.c
+@@ -987,7 +987,7 @@ static void nv_enable_irq(struct net_device *dev)
+ if (np->msi_flags & NV_MSI_X_ENABLED)
+ enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector);
+ else
+- enable_irq(dev->irq);
++ enable_irq(np->pci_dev->irq);
+ } else {
+ enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector);
+ enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_TX].vector);
+@@ -1003,7 +1003,7 @@ static void nv_disable_irq(struct net_device *dev)
+ if (np->msi_flags & NV_MSI_X_ENABLED)
+ disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector);
+ else
+- disable_irq(dev->irq);
++ disable_irq(np->pci_dev->irq);
+ } else {
+ disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector);
+ disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_TX].vector);
+@@ -1600,7 +1600,7 @@ static void nv_do_rx_refill(unsigned long data)
+ if (np->msi_flags & NV_MSI_X_ENABLED)
+ disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector);
+ else
+- disable_irq(dev->irq);
++ disable_irq(np->pci_dev->irq);
+ } else {
+ disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector);
+ }
+@@ -1618,7 +1618,7 @@ static void nv_do_rx_refill(unsigned long data)
+ if (np->msi_flags & NV_MSI_X_ENABLED)
+ enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector);
+ else
+- enable_irq(dev->irq);
++ enable_irq(np->pci_dev->irq);
+ } else {
+ enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector);
+ }
+@@ -3556,10 +3556,12 @@ static int nv_request_irq(struct net_device *dev, int intr_test)
+ if (ret != 0 && np->msi_flags & NV_MSI_CAPABLE) {
+ if ((ret = pci_enable_msi(np->pci_dev)) == 0) {
+ np->msi_flags |= NV_MSI_ENABLED;
++ dev->irq = np->pci_dev->irq;
+ if (request_irq(np->pci_dev->irq, handler, IRQF_SHARED, dev->name, dev) != 0) {
+ printk(KERN_INFO "forcedeth: request_irq failed %d\n", ret);
+ pci_disable_msi(np->pci_dev);
+ np->msi_flags &= ~NV_MSI_ENABLED;
++ dev->irq = np->pci_dev->irq;
+ goto out_err;
+ }
+
+@@ -3622,7 +3624,7 @@ static void nv_do_nic_poll(unsigned long data)
+ if (np->msi_flags & NV_MSI_X_ENABLED)
+ disable_irq_lockdep(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector);
+ else
+- disable_irq_lockdep(dev->irq);
++ disable_irq_lockdep(np->pci_dev->irq);
+ mask = np->irqmask;
+ } else {
+ if (np->nic_poll_irq & NVREG_IRQ_RX_ALL) {
+@@ -3640,6 +3642,8 @@ static void nv_do_nic_poll(unsigned long data)
+ }
+ np->nic_poll_irq = 0;
+
++ /* disable_irq() contains synchronize_irq, thus no irq handler can run now */
++
+ if (np->recover_error) {
+ np->recover_error = 0;
+ printk(KERN_INFO "forcedeth: MAC in recoverable error state\n");
+@@ -3676,7 +3680,6 @@ static void nv_do_nic_poll(unsigned long data)
+ }
+ }
+
+- /* FIXME: Do we need synchronize_irq(dev->irq) here? */
+
+ writel(mask, base + NvRegIrqMask);
+ pci_push(base);
+@@ -3689,7 +3692,7 @@ static void nv_do_nic_poll(unsigned long data)
+ if (np->msi_flags & NV_MSI_X_ENABLED)
+ enable_irq_lockdep(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector);
+ else
+- enable_irq_lockdep(dev->irq);
++ enable_irq_lockdep(np->pci_dev->irq);
+ } else {
+ if (np->nic_poll_irq & NVREG_IRQ_RX_ALL) {
+ nv_nic_irq_rx(0, dev);
+@@ -4943,7 +4946,7 @@ static int nv_close(struct net_device *dev)
+ np->in_shutdown = 1;
+ spin_unlock_irq(&np->lock);
+ netif_poll_disable(dev);
+- synchronize_irq(dev->irq);
++ synchronize_irq(np->pci_dev->irq);
+
+ del_timer_sync(&np->oom_kick);
+ del_timer_sync(&np->nic_poll);
+diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c
+index bec83cb..7e40105 100644
+--- a/drivers/scsi/hptiop.c
++++ b/drivers/scsi/hptiop.c
+@@ -377,8 +377,9 @@ static void hptiop_host_request_callback(struct hptiop_hba *hba, u32 tag)
+ scp->result = SAM_STAT_CHECK_CONDITION;
+ memset(&scp->sense_buffer,
+ 0, sizeof(scp->sense_buffer));
+- memcpy(&scp->sense_buffer,
+- &req->sg_list, le32_to_cpu(req->dataxfer_length));
++ memcpy(&scp->sense_buffer, &req->sg_list,
++ min(sizeof(scp->sense_buffer),
++ le32_to_cpu(req->dataxfer_length)));
+ break;
+
+ default:
+diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h
+index ef50fa4..87f6467 100644
+--- a/drivers/usb/core/hcd.h
++++ b/drivers/usb/core/hcd.h
+@@ -19,6 +19,8 @@
+
+ #ifdef __KERNEL__
+
++#include <linux/rwsem.h>
++
+ /* This file contains declarations of usbcore internals that are mostly
+ * used or exposed by Host Controller Drivers.
+ */
+@@ -464,5 +466,9 @@ static inline void usbmon_urb_complete(struct usb_bus *bus, struct urb *urb) {}
+ : (in_interrupt () ? "in_interrupt" : "can sleep"))
+
+
+-#endif /* __KERNEL__ */
++/* This rwsem is for use only by the hub driver and ehci-hcd.
++ * Nobody else should touch it.
++ */
++extern struct rw_semaphore ehci_cf_port_reset_rwsem;
+
++#endif /* __KERNEL__ */
+diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
+index a1c1a11..bc93e06 100644
+--- a/drivers/usb/core/hub.c
++++ b/drivers/usb/core/hub.c
+@@ -117,6 +117,12 @@ MODULE_PARM_DESC(use_both_schemes,
+ "try the other device initialization scheme if the "
+ "first one fails");
+
++/* Mutual exclusion for EHCI CF initialization. This interferes with
++ * port reset on some companion controllers.
++ */
++DECLARE_RWSEM(ehci_cf_port_reset_rwsem);
++EXPORT_SYMBOL_GPL(ehci_cf_port_reset_rwsem);
++
+
+ static inline char *portspeed(int portstatus)
+ {
+@@ -1513,6 +1519,11 @@ static int hub_port_reset(struct usb_hub *hub, int port1,
+ {
+ int i, status;
+
++ /* Block EHCI CF initialization during the port reset.
++ * Some companion controllers don't like it when they mix.
++ */
++ down_read(&ehci_cf_port_reset_rwsem);
++
+ /* Reset the port */
+ for (i = 0; i < PORT_RESET_TRIES; i++) {
+ status = set_port_feature(hub->hdev,
+@@ -1543,7 +1554,7 @@ static int hub_port_reset(struct usb_hub *hub, int port1,
+ usb_set_device_state(udev, status
+ ? USB_STATE_NOTATTACHED
+ : USB_STATE_DEFAULT);
+- return status;
++ goto done;
+ }
+
+ dev_dbg (hub->intfdev,
+@@ -1556,6 +1567,8 @@ static int hub_port_reset(struct usb_hub *hub, int port1,
+ "Cannot enable port %i. Maybe the USB cable is bad?\n",
+ port1);
+
++ done:
++ up_read(&ehci_cf_port_reset_rwsem);
+ return status;
+ }
+
+diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
+index 099aff6..5caa8b3 100644
+--- a/drivers/usb/host/ehci-hcd.c
++++ b/drivers/usb/host/ehci-hcd.c
+@@ -566,10 +566,18 @@ static int ehci_run (struct usb_hcd *hcd)
+ * are explicitly handed to companion controller(s), so no TT is
+ * involved with the root hub. (Except where one is integrated,
+ * and there's no companion controller unless maybe for USB OTG.)
++ *
++ * Turning on the CF flag will transfer ownership of all ports
++ * from the companions to the EHCI controller. If any of the
++ * companions are in the middle of a port reset at the time, it
++ * could cause trouble. Write-locking ehci_cf_port_reset_rwsem
++ * guarantees that no resets are in progress.
+ */
++ down_write(&ehci_cf_port_reset_rwsem);
+ hcd->state = HC_STATE_RUNNING;
+ ehci_writel(ehci, FLAG_CF, &ehci->regs->configured_flag);
+ ehci_readl(ehci, &ehci->regs->command); /* unblock posted writes */
++ up_write(&ehci_cf_port_reset_rwsem);
+
+ temp = HC_VERSION(ehci_readl(ehci, &ehci->caps->hc_capbase));
+ ehci_info (ehci,
+diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
+index 4f8282a..c36eb79 100644
+--- a/drivers/usb/serial/generic.c
++++ b/drivers/usb/serial/generic.c
+@@ -190,14 +190,15 @@ int usb_serial_generic_write(struct usb_serial_port *port, const unsigned char *
+
+ /* only do something if we have a bulk out endpoint */
+ if (serial->num_bulk_out) {
+- spin_lock_bh(&port->lock);
++ unsigned long flags;
++ spin_lock_irqsave(&port->lock, flags);
+ if (port->write_urb_busy) {
+- spin_unlock_bh(&port->lock);
++ spin_unlock_irqrestore(&port->lock, flags);
+ dbg("%s - already writing", __FUNCTION__);
+ return 0;
+ }
+ port->write_urb_busy = 1;
+- spin_unlock_bh(&port->lock);
++ spin_unlock_irqrestore(&port->lock, flags);
+
+ count = (count > port->bulk_out_size) ? port->bulk_out_size : count;
+
+diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c
+index 0683b51..6f22419 100644
+--- a/drivers/usb/serial/kobil_sct.c
++++ b/drivers/usb/serial/kobil_sct.c
+@@ -82,6 +82,7 @@ static int kobil_tiocmset(struct usb_serial_port *port, struct file *file,
+ unsigned int set, unsigned int clear);
+ static void kobil_read_int_callback( struct urb *urb );
+ static void kobil_write_callback( struct urb *purb );
++static void kobil_set_termios(struct usb_serial_port *port, struct ktermios *old);
+
+
+ static struct usb_device_id id_table [] = {
+@@ -119,6 +120,7 @@ static struct usb_serial_driver kobil_device = {
+ .attach = kobil_startup,
+ .shutdown = kobil_shutdown,
+ .ioctl = kobil_ioctl,
++ .set_termios = kobil_set_termios,
+ .tiocmget = kobil_tiocmget,
+ .tiocmset = kobil_tiocmset,
+ .open = kobil_open,
+@@ -137,7 +139,6 @@ struct kobil_private {
+ int cur_pos; // index of the next char to send in buf
+ __u16 device_type;
+ int line_state;
+- struct ktermios internal_termios;
+ };
+
+
+@@ -216,7 +217,7 @@ static void kobil_shutdown (struct usb_serial *serial)
+
+ static int kobil_open (struct usb_serial_port *port, struct file *filp)
+ {
+- int i, result = 0;
++ int result = 0;
+ struct kobil_private *priv;
+ unsigned char *transfer_buffer;
+ int transfer_buffer_length = 8;
+@@ -242,16 +243,6 @@ static int kobil_open (struct usb_serial_port *port, struct file *filp)
+ port->tty->termios->c_iflag = IGNBRK | IGNPAR | IXOFF;
+ port->tty->termios->c_oflag &= ~ONLCR; // do NOT translate CR to CR-NL (0x0A -> 0x0A 0x0D)
+
+- // set up internal termios structure
+- priv->internal_termios.c_iflag = port->tty->termios->c_iflag;
+- priv->internal_termios.c_oflag = port->tty->termios->c_oflag;
+- priv->internal_termios.c_cflag = port->tty->termios->c_cflag;
+- priv->internal_termios.c_lflag = port->tty->termios->c_lflag;
+-
+- for (i=0; i<NCCS; i++) {
+- priv->internal_termios.c_cc[i] = port->tty->termios->c_cc[i];
+- }
+-
+ // allocate memory for transfer buffer
+ transfer_buffer = kzalloc(transfer_buffer_length, GFP_KERNEL);
+ if (! transfer_buffer) {
+@@ -358,24 +349,26 @@ static void kobil_close (struct usb_serial_port *port, struct file *filp)
+ }
+
+
+-static void kobil_read_int_callback( struct urb *purb)
++static void kobil_read_int_callback(struct urb *urb)
+ {
+ int result;
+- struct usb_serial_port *port = (struct usb_serial_port *) purb->context;
++ struct usb_serial_port *port = urb->context;
+ struct tty_struct *tty;
+- unsigned char *data = purb->transfer_buffer;
++ unsigned char *data = urb->transfer_buffer;
++ int status = urb->status;
+ // char *dbg_data;
+
+ dbg("%s - port %d", __FUNCTION__, port->number);
+
+- if (purb->status) {
+- dbg("%s - port %d Read int status not zero: %d", __FUNCTION__, port->number, purb->status);
++ if (status) {
++ dbg("%s - port %d Read int status not zero: %d",
++ __FUNCTION__, port->number, status);
+ return;
+ }
+-
+- tty = port->tty;
+- if (purb->actual_length) {
+-
++
++ tty = port->tty;
++ if (urb->actual_length) {
++
+ // BEGIN DEBUG
+ /*
+ dbg_data = kzalloc((3 * purb->actual_length + 10) * sizeof(char), GFP_KERNEL);
+@@ -390,15 +383,15 @@ static void kobil_read_int_callback( struct urb *purb)
+ */
+ // END DEBUG
+
+- tty_buffer_request_room(tty, purb->actual_length);
+- tty_insert_flip_string(tty, data, purb->actual_length);
++ tty_buffer_request_room(tty, urb->actual_length);
++ tty_insert_flip_string(tty, data, urb->actual_length);
+ tty_flip_buffer_push(tty);
+ }
+
+ // someone sets the dev to 0 if the close method has been called
+ port->interrupt_in_urb->dev = port->serial->dev;
+
+- result = usb_submit_urb( port->interrupt_in_urb, GFP_ATOMIC );
++ result = usb_submit_urb(port->interrupt_in_urb, GFP_ATOMIC);
+ dbg("%s - port %d Send read URB returns: %i", __FUNCTION__, port->number, result);
+ }
+
+@@ -605,102 +598,79 @@ static int kobil_tiocmset(struct usb_serial_port *port, struct file *file,
+ return (result < 0) ? result : 0;
+ }
+
+-
+-static int kobil_ioctl(struct usb_serial_port *port, struct file *file,
+- unsigned int cmd, unsigned long arg)
++static void kobil_set_termios(struct usb_serial_port *port, struct ktermios *old)
+ {
+ struct kobil_private * priv;
+ int result;
+ unsigned short urb_val = 0;
+- unsigned char *transfer_buffer;
+- int transfer_buffer_length = 8;
+- char *settings;
+- void __user *user_arg = (void __user *)arg;
++ int c_cflag = port->tty->termios->c_cflag;
++ speed_t speed;
++ void * settings;
+
+ priv = usb_get_serial_port_data(port);
+- if ((priv->device_type == KOBIL_USBTWIN_PRODUCT_ID) || (priv->device_type == KOBIL_KAAN_SIM_PRODUCT_ID)) {
++ if (priv->device_type == KOBIL_USBTWIN_PRODUCT_ID || priv->device_type == KOBIL_KAAN_SIM_PRODUCT_ID)
+ // This device doesn't support ioctl calls
+- return 0;
+- }
+-
+- switch (cmd) {
+- case TCGETS: // 0x5401
+- if (!access_ok(VERIFY_WRITE, user_arg, sizeof(struct ktermios))) {
+- dbg("%s - port %d Error in access_ok", __FUNCTION__, port->number);
+- return -EFAULT;
+- }
+- if (kernel_termios_to_user_termios((struct ktermios __user *)arg,
+- &priv->internal_termios))
+- return -EFAULT;
+- return 0;
+-
+- case TCSETS: // 0x5402
+- if (!(port->tty->termios)) {
+- dbg("%s - port %d Error: port->tty->termios is NULL", __FUNCTION__, port->number);
+- return -ENOTTY;
+- }
+- if (!access_ok(VERIFY_READ, user_arg, sizeof(struct ktermios))) {
+- dbg("%s - port %d Error in access_ok", __FUNCTION__, port->number);
+- return -EFAULT;
+- }
+- if (user_termios_to_kernel_termios(&priv->internal_termios,
+- (struct ktermios __user *)arg))
+- return -EFAULT;
+-
+- settings = kzalloc(50, GFP_KERNEL);
+- if (! settings) {
+- return -ENOBUFS;
+- }
++ return;
+
+- switch (priv->internal_termios.c_cflag & CBAUD) {
+- case B1200:
++ switch (speed = tty_get_baud_rate(port->tty)) {
++ case 1200:
+ urb_val = SUSBCR_SBR_1200;
+- strcat(settings, "1200 ");
+ break;
+- case B9600:
++ case 9600:
+ default:
+ urb_val = SUSBCR_SBR_9600;
+- strcat(settings, "9600 ");
+ break;
+- }
++ }
++ urb_val |= (c_cflag & CSTOPB) ? SUSBCR_SPASB_2StopBits : SUSBCR_SPASB_1StopBit;
+
+- urb_val |= (priv->internal_termios.c_cflag & CSTOPB) ? SUSBCR_SPASB_2StopBits : SUSBCR_SPASB_1StopBit;
+- strcat(settings, (priv->internal_termios.c_cflag & CSTOPB) ? "2 StopBits " : "1 StopBit ");
++ settings = kzalloc(50, GFP_KERNEL);
++ if (! settings)
++ return;
+
+- if (priv->internal_termios.c_cflag & PARENB) {
+- if (priv->internal_termios.c_cflag & PARODD) {
+- urb_val |= SUSBCR_SPASB_OddParity;
+- strcat(settings, "Odd Parity");
+- } else {
+- urb_val |= SUSBCR_SPASB_EvenParity;
+- strcat(settings, "Even Parity");
+- }
++ sprintf(settings, "%d ", speed);
++
++ if (c_cflag & PARENB) {
++ if (c_cflag & PARODD) {
++ urb_val |= SUSBCR_SPASB_OddParity;
++ strcat(settings, "Odd Parity");
+ } else {
+- urb_val |= SUSBCR_SPASB_NoParity;
+- strcat(settings, "No Parity");
++ urb_val |= SUSBCR_SPASB_EvenParity;
++ strcat(settings, "Even Parity");
+ }
+- dbg("%s - port %d setting port to: %s", __FUNCTION__, port->number, settings );
++ } else {
++ urb_val |= SUSBCR_SPASB_NoParity;
++ strcat(settings, "No Parity");
++ }
+
+- result = usb_control_msg( port->serial->dev,
+- usb_rcvctrlpipe(port->serial->dev, 0 ),
+- SUSBCRequest_SetBaudRateParityAndStopBits,
+- USB_TYPE_VENDOR | USB_RECIP_ENDPOINT | USB_DIR_OUT,
+- urb_val,
+- 0,
+- settings,
+- 0,
+- KOBIL_TIMEOUT
+- );
++ result = usb_control_msg( port->serial->dev,
++ usb_rcvctrlpipe(port->serial->dev, 0 ),
++ SUSBCRequest_SetBaudRateParityAndStopBits,
++ USB_TYPE_VENDOR | USB_RECIP_ENDPOINT | USB_DIR_OUT,
++ urb_val,
++ 0,
++ settings,
++ 0,
++ KOBIL_TIMEOUT
++ );
++ kfree(settings);
++}
+
+- dbg("%s - port %d Send set_baudrate URB returns: %i", __FUNCTION__, port->number, result);
+- kfree(settings);
++static int kobil_ioctl(struct usb_serial_port *port, struct file * file, unsigned int cmd, unsigned long arg)
++{
++ struct kobil_private * priv = usb_get_serial_port_data(port);
++ unsigned char *transfer_buffer;
++ int transfer_buffer_length = 8;
++ int result;
++
++ if (priv->device_type == KOBIL_USBTWIN_PRODUCT_ID || priv->device_type == KOBIL_KAAN_SIM_PRODUCT_ID)
++ // This device doesn't support ioctl calls
+ return 0;
+
++ switch (cmd) {
+ case TCFLSH: // 0x540B
+ transfer_buffer = kmalloc(transfer_buffer_length, GFP_KERNEL);
+- if (! transfer_buffer) {
++ if (! transfer_buffer)
+ return -ENOBUFS;
+- }
+
+ result = usb_control_msg( port->serial->dev,
+ usb_rcvctrlpipe(port->serial->dev, 0 ),
+@@ -714,15 +684,13 @@ static int kobil_ioctl(struct usb_serial_port *port, struct file *file,
+ );
+
+ dbg("%s - port %d Send reset_all_queues (FLUSH) URB returns: %i", __FUNCTION__, port->number, result);
+-
+ kfree(transfer_buffer);
+- return ((result < 0) ? -EFAULT : 0);
+-
++ return (result < 0) ? -EFAULT : 0;
++ default:
++ return -ENOIOCTLCMD;
+ }
+- return -ENOIOCTLCMD;
+ }
+
+-
+ static int __init kobil_init (void)
+ {
+ int retval;
+diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
+index a480b09..3175288 100644
+--- a/fs/ocfs2/aops.c
++++ b/fs/ocfs2/aops.c
+@@ -661,6 +661,27 @@ static void ocfs2_clear_page_regions(struct page *page,
+ }
+
+ /*
++ * Nonsparse file systems fully allocate before we get to the write
++ * code. This prevents ocfs2_write() from tagging the write as an
++ * allocating one, which means ocfs2_map_page_blocks() might try to
++ * read-in the blocks at the tail of our file. Avoid reading them by
++ * testing i_size against each block offset.
++ */
++static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
++ unsigned int block_start)
++{
++ u64 offset = page_offset(page) + block_start;
++
++ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
++ return 1;
++
++ if (i_size_read(inode) > offset)
++ return 1;
++
++ return 0;
++}
++
++/*
+ * Some of this taken from block_prepare_write(). We already have our
+ * mapping by now though, and the entire write will be allocating or
+ * it won't, so not much need to use BH_New.
+@@ -711,7 +732,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+- (block_start < from || block_end > to)) {
++ ocfs2_should_read_blk(inode, page, block_start) &&
++ (block_start < from || block_end > to)) {
+ ll_rw_block(READ, 1, &bh);
+ *wait_bh++=bh;
+ }
+diff --git a/include/linux/netlink.h b/include/linux/netlink.h
+index 2e23353..b2834d8 100644
+--- a/include/linux/netlink.h
++++ b/include/linux/netlink.h
+@@ -173,7 +173,7 @@ extern int netlink_unregister_notifier(struct notifier_block *nb);
+ /* finegrained unicast helpers: */
+ struct sock *netlink_getsockbyfilp(struct file *filp);
+ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
+- long timeo, struct sock *ssk);
++ long *timeo, struct sock *ssk);
+ void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
+ int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol);
+
+diff --git a/ipc/mqueue.c b/ipc/mqueue.c
+index a242c83..1eef14b 100644
+--- a/ipc/mqueue.c
++++ b/ipc/mqueue.c
+@@ -1014,6 +1014,8 @@ asmlinkage long sys_mq_notify(mqd_t mqdes,
+ return -EINVAL;
+ }
+ if (notification.sigev_notify == SIGEV_THREAD) {
++ long timeo;
++
+ /* create the notify skb */
+ nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
+ ret = -ENOMEM;
+@@ -1042,8 +1044,8 @@ retry:
+ goto out;
+ }
+
+- ret = netlink_attachskb(sock, nc, 0,
+- MAX_SCHEDULE_TIMEOUT, NULL);
++ timeo = MAX_SCHEDULE_TIMEOUT;
++ ret = netlink_attachskb(sock, nc, 0, &timeo, NULL);
+ if (ret == 1)
+ goto retry;
+ if (ret) {
+diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
+index 7e52eb0..589b1e4 100644
+--- a/kernel/futex_compat.c
++++ b/kernel/futex_compat.c
+@@ -29,6 +29,15 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ return 0;
+ }
+
++static void __user *futex_uaddr(struct robust_list *entry,
++ compat_long_t futex_offset)
++{
++ compat_uptr_t base = ptr_to_compat(entry);
++ void __user *uaddr = compat_ptr(base + futex_offset);
++
++ return uaddr;
++}
++
+ /*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+@@ -61,18 +70,23 @@ void compat_exit_robust_list(struct task_struct *curr)
+ if (fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+- if (pending)
+- handle_futex_death((void __user *)pending + futex_offset, curr, pip);
++ if (pending) {
++ void __user *uaddr = futex_uaddr(pending,
++ futex_offset);
++ handle_futex_death(uaddr, curr, pip);
++ }
+
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+- if (entry != pending)
+- if (handle_futex_death((void __user *)entry + futex_offset,
+- curr, pi))
++ if (entry != pending) {
++ void __user *uaddr = futex_uaddr(entry,
++ futex_offset);
++ if (handle_futex_death(uaddr, curr, pi))
+ return;
++ }
+
+ /*
+ * Fetch the next entry in the list:
+diff --git a/kernel/params.c b/kernel/params.c
+index 8e8ca8f..1f17b58 100644
+--- a/kernel/params.c
++++ b/kernel/params.c
+@@ -591,19 +591,16 @@ static void __init param_sysfs_builtin(void)
+
+ for (i=0; i < __stop___param - __start___param; i++) {
+ char *dot;
+- size_t kplen;
++ size_t max_name_len;
+
+ kp = &__start___param[i];
+- kplen = strlen(kp->name);
++ max_name_len =
++ min_t(size_t, MAX_KBUILD_MODNAME, strlen(kp->name));
+
+- /* We do not handle args without periods. */
+- if (kplen > MAX_KBUILD_MODNAME) {
+- DEBUGP("kernel parameter name is too long: %s\n", kp->name);
+- continue;
+- }
+- dot = memchr(kp->name, '.', kplen);
++ dot = memchr(kp->name, '.', max_name_len);
+ if (!dot) {
+- DEBUGP("couldn't find period in %s\n", kp->name);
++ DEBUGP("couldn't find period in first %d characters "
++ "of %s\n", MAX_KBUILD_MODNAME, kp->name);
+ continue;
+ }
+ name_len = dot - kp->name;
+diff --git a/mm/page-writeback.c b/mm/page-writeback.c
+index eec1481..2d39627 100644
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -674,8 +674,10 @@ retry:
+
+ ret = (*writepage)(page, wbc, data);
+
+- if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
++ if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+ unlock_page(page);
++ ret = 0;
++ }
+ if (ret || (--(wbc->nr_to_write) <= 0))
+ done = 1;
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+diff --git a/mm/shmem.c b/mm/shmem.c
+index b6aae2b..2320b60 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -911,6 +911,21 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+ struct inode *inode;
+
+ BUG_ON(!PageLocked(page));
++ /*
++ * shmem_backing_dev_info's capabilities prevent regular writeback or
++ * sync from ever calling shmem_writepage; but a stacking filesystem
++ * may use the ->writepage of its underlying filesystem, in which case
++ * we want to do nothing when that underlying filesystem is tmpfs
++ * (writing out to swap is useful as a response to memory pressure, but
++ * of no use to stabilize the data) - just redirty the page, unlock it
++ * and claim success in this case. AOP_WRITEPAGE_ACTIVATE, and the
++ * page_mapped check below, must be avoided unless we're in reclaim.
++ */
++ if (!wbc->for_reclaim) {
++ set_page_dirty(page);
++ unlock_page(page);
++ return 0;
++ }
+ BUG_ON(page_mapped(page));
+
+ mapping = page->mapping;
+diff --git a/mm/slub.c b/mm/slub.c
+index e0cf621..648f2c7 100644
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -1431,28 +1431,8 @@ new_slab:
+ page = new_slab(s, gfpflags, node);
+ if (page) {
+ cpu = smp_processor_id();
+- if (s->cpu_slab[cpu]) {
+- /*
+- * Someone else populated the cpu_slab while we
+- * enabled interrupts, or we have gotten scheduled
+- * on another cpu. The page may not be on the
+- * requested node even if __GFP_THISNODE was
+- * specified. So we need to recheck.
+- */
+- if (node == -1 ||
+- page_to_nid(s->cpu_slab[cpu]) == node) {
+- /*
+- * Current cpuslab is acceptable and we
+- * want the current one since its cache hot
+- */
+- discard_slab(s, page);
+- page = s->cpu_slab[cpu];
+- slab_lock(page);
+- goto load_freelist;
+- }
+- /* New slab does not fit our expectations */
++ if (s->cpu_slab[cpu])
+ flush_slab(s, s->cpu_slab[cpu], cpu);
+- }
+ slab_lock(page);
+ SetSlabFrozen(page);
+ s->cpu_slab[cpu] = page;
+diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
+index ab86137..630ebb7 100644
+--- a/net/ipv4/ipcomp.c
++++ b/net/ipv4/ipcomp.c
+@@ -17,6 +17,7 @@
+ #include <asm/scatterlist.h>
+ #include <asm/semaphore.h>
+ #include <linux/crypto.h>
++#include <linux/err.h>
+ #include <linux/pfkeyv2.h>
+ #include <linux/percpu.h>
+ #include <linux/smp.h>
+@@ -355,7 +356,7 @@ static struct crypto_comp **ipcomp_alloc_tfms(const char *alg_name)
+ for_each_possible_cpu(cpu) {
+ struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0,
+ CRYPTO_ALG_ASYNC);
+- if (!tfm)
++ if (IS_ERR(tfm))
+ goto error;
+ *per_cpu_ptr(tfms, cpu) = tfm;
+ }
+diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
+index 1ee50b5..3680f64 100644
+--- a/net/ipv6/ipcomp6.c
++++ b/net/ipv6/ipcomp6.c
+@@ -37,6 +37,7 @@
+ #include <asm/scatterlist.h>
+ #include <asm/semaphore.h>
+ #include <linux/crypto.h>
++#include <linux/err.h>
+ #include <linux/pfkeyv2.h>
+ #include <linux/random.h>
+ #include <linux/percpu.h>
+@@ -366,7 +367,7 @@ static struct crypto_comp **ipcomp6_alloc_tfms(const char *alg_name)
+ for_each_possible_cpu(cpu) {
+ struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0,
+ CRYPTO_ALG_ASYNC);
+- if (!tfm)
++ if (IS_ERR(tfm))
+ goto error;
+ *per_cpu_ptr(tfms, cpu) = tfm;
+ }
+diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
+index ccdd5d2..2721ff4 100644
+--- a/net/netfilter/nf_conntrack_proto_tcp.c
++++ b/net/netfilter/nf_conntrack_proto_tcp.c
+@@ -839,6 +839,22 @@ static int tcp_packet(struct nf_conn *conntrack,
+ new_state = tcp_conntracks[dir][index][old_state];
+
+ switch (new_state) {
++ case TCP_CONNTRACK_SYN_SENT:
++ if (old_state < TCP_CONNTRACK_TIME_WAIT)
++ break;
++ if ((conntrack->proto.tcp.seen[!dir].flags &
++ IP_CT_TCP_FLAG_CLOSE_INIT)
++ || (conntrack->proto.tcp.last_dir == dir
++ && conntrack->proto.tcp.last_index == TCP_RST_SET)) {
++ /* Attempt to reopen a closed/aborted connection.
++ * Delete this connection and look up again. */
++ write_unlock_bh(&tcp_lock);
++ if (del_timer(&conntrack->timeout))
++ conntrack->timeout.function((unsigned long)
++ conntrack);
++ return -NF_REPEAT;
++ }
++ /* Fall through */
+ case TCP_CONNTRACK_IGNORE:
+ /* Ignored packets:
+ *
+@@ -888,27 +904,6 @@ static int tcp_packet(struct nf_conn *conntrack,
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: invalid state ");
+ return -NF_ACCEPT;
+- case TCP_CONNTRACK_SYN_SENT:
+- if (old_state < TCP_CONNTRACK_TIME_WAIT)
+- break;
+- if ((conntrack->proto.tcp.seen[dir].flags &
+- IP_CT_TCP_FLAG_CLOSE_INIT)
+- || after(ntohl(th->seq),
+- conntrack->proto.tcp.seen[dir].td_end)) {
+- /* Attempt to reopen a closed connection.
+- * Delete this connection and look up again. */
+- write_unlock_bh(&tcp_lock);
+- if (del_timer(&conntrack->timeout))
+- conntrack->timeout.function((unsigned long)
+- conntrack);
+- return -NF_REPEAT;
+- } else {
+- write_unlock_bh(&tcp_lock);
+- if (LOG_INVALID(IPPROTO_TCP))
+- nf_log_packet(pf, 0, skb, NULL, NULL,
+- NULL, "nf_ct_tcp: invalid SYN");
+- return -NF_ACCEPT;
+- }
+ case TCP_CONNTRACK_CLOSE:
+ if (index == TCP_RST_SET
+ && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
+@@ -941,6 +936,7 @@ static int tcp_packet(struct nf_conn *conntrack,
+ in_window:
+ /* From now on we have got in-window packets */
+ conntrack->proto.tcp.last_index = index;
++ conntrack->proto.tcp.last_dir = dir;
+
+ DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+ "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
+diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
+index 1f15821..6ac83c2 100644
+--- a/net/netlink/af_netlink.c
++++ b/net/netlink/af_netlink.c
+@@ -732,7 +732,7 @@ struct sock *netlink_getsockbyfilp(struct file *filp)
+ * 1: repeat lookup - reference dropped while waiting for socket memory.
+ */
+ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
+- long timeo, struct sock *ssk)
++ long *timeo, struct sock *ssk)
+ {
+ struct netlink_sock *nlk;
+
+@@ -741,7 +741,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ test_bit(0, &nlk->state)) {
+ DECLARE_WAITQUEUE(wait, current);
+- if (!timeo) {
++ if (!*timeo) {
+ if (!ssk || nlk_sk(ssk)->pid == 0)
+ netlink_overrun(sk);
+ sock_put(sk);
+@@ -755,7 +755,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
+ if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ test_bit(0, &nlk->state)) &&
+ !sock_flag(sk, SOCK_DEAD))
+- timeo = schedule_timeout(timeo);
++ *timeo = schedule_timeout(*timeo);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&nlk->wait, &wait);
+@@ -763,7 +763,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
+
+ if (signal_pending(current)) {
+ kfree_skb(skb);
+- return sock_intr_errno(timeo);
++ return sock_intr_errno(*timeo);
+ }
+ return 1;
+ }
+@@ -827,7 +827,7 @@ retry:
+ kfree_skb(skb);
+ return PTR_ERR(sk);
+ }
+- err = netlink_attachskb(sk, skb, nonblock, timeo, ssk);
++ err = netlink_attachskb(sk, skb, nonblock, &timeo, ssk);
+ if (err == 1)
+ goto retry;
+ if (err)
+diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
+index f2686ea..1d36265 100644
+--- a/net/sched/cls_u32.c
++++ b/net/sched/cls_u32.c
+@@ -107,7 +107,7 @@ static struct tc_u_common *u32_list;
+
+ static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel, u8 fshift)
+ {
+- unsigned h = (key & sel->hmask)>>fshift;
++ unsigned h = ntohl(key & sel->hmask)>>fshift;
+
+ return h;
+ }
+@@ -631,7 +631,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
+ n->handle = handle;
+ {
+ u8 i = 0;
+- u32 mask = s->hmask;
++ u32 mask = ntohl(s->hmask);
+ if (mask) {
+ while (!(mask & 1)) {
+ i++;
+diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
+index f05ad9a..656ccd9 100644
+--- a/net/sched/sch_teql.c
++++ b/net/sched/sch_teql.c
+@@ -263,6 +263,9 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *
+ static __inline__ int
+ teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
+ {
++ if (dev->qdisc == &noop_qdisc)
++ return -ENODEV;
++
+ if (dev->hard_header == NULL ||
+ skb->dst == NULL ||
+ skb->dst->neighbour == NULL)
+diff --git a/net/socket.c b/net/socket.c
+index 48bd793..8211578 100644
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -1246,11 +1246,14 @@ asmlinkage long sys_socketpair(int family, int type, int protocol,
+ goto out_release_both;
+
+ fd1 = sock_alloc_fd(&newfile1);
+- if (unlikely(fd1 < 0))
++ if (unlikely(fd1 < 0)) {
++ err = fd1;
+ goto out_release_both;
++ }
+
+ fd2 = sock_alloc_fd(&newfile2);
+ if (unlikely(fd2 < 0)) {
++ err = fd2;
+ put_filp(newfile1);
+ put_unused_fd(fd1);
+ goto out_release_both;
+diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c
+index e3964fc..d5b2f53 100644
+--- a/sound/pci/hda/patch_sigmatel.c
++++ b/sound/pci/hda/patch_sigmatel.c
+@@ -153,8 +153,9 @@ static hda_nid_t stac925x_dac_nids[1] = {
+ 0x02,
+ };
+
+-static hda_nid_t stac925x_dmic_nids[1] = {
+- 0x15,
++#define STAC925X_NUM_DMICS 1
++static hda_nid_t stac925x_dmic_nids[STAC925X_NUM_DMICS + 1] = {
++ 0x15, 0
+ };
+
+ static hda_nid_t stac922x_adc_nids[2] = {
+@@ -181,8 +182,9 @@ static hda_nid_t stac9205_mux_nids[2] = {
+ 0x19, 0x1a
+ };
+
+-static hda_nid_t stac9205_dmic_nids[2] = {
+- 0x17, 0x18,
++#define STAC9205_NUM_DMICS 2
++static hda_nid_t stac9205_dmic_nids[STAC9205_NUM_DMICS + 1] = {
++ 0x17, 0x18, 0
+ };
+
+ static hda_nid_t stac9200_pin_nids[8] = {
+@@ -1972,7 +1974,7 @@ static int patch_stac925x(struct hda_codec *codec)
+ case 0x83847633: /* STAC9202D */
+ case 0x83847636: /* STAC9251 */
+ case 0x83847637: /* STAC9251D */
+- spec->num_dmics = 1;
++ spec->num_dmics = STAC925X_NUM_DMICS;
+ spec->dmic_nids = stac925x_dmic_nids;
+ break;
+ default:
+@@ -2202,7 +2204,7 @@ static int patch_stac9205(struct hda_codec *codec)
+ spec->mux_nids = stac9205_mux_nids;
+ spec->num_muxes = ARRAY_SIZE(stac9205_mux_nids);
+ spec->dmic_nids = stac9205_dmic_nids;
+- spec->num_dmics = ARRAY_SIZE(stac9205_dmic_nids);
++ spec->num_dmics = STAC9205_NUM_DMICS;
+ spec->dmux_nid = 0x1d;
+
+ spec->init = stac9205_core_init;
+diff --git a/sound/pci/rme9652/hdsp.c b/sound/pci/rme9652/hdsp.c
+index 3b3ef65..75dcb9a 100644
+--- a/sound/pci/rme9652/hdsp.c
++++ b/sound/pci/rme9652/hdsp.c
+@@ -3108,6 +3108,9 @@ static int hdsp_dds_offset(struct hdsp *hdsp)
+ unsigned int dds_value = hdsp->dds_value;
+ int system_sample_rate = hdsp->system_sample_rate;
+
++ if (!dds_value)
++ return 0;
++
+ n = DDS_NUMERATOR;
+ /*
+ * dds_value = n / rate
diff --git a/tags/2.6.22-2/01014_linux-2.6.22.15.patch b/tags/2.6.22-2/01014_linux-2.6.22.15.patch
new file mode 100644
index 0000000..320c021
--- /dev/null
+++ b/tags/2.6.22-2/01014_linux-2.6.22.15.patch
@@ -0,0 +1,1096 @@
+Subject: Linux 2.6.22.15
+From: Greg Kroah-Hartman <gregkh@suse.de>
+
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+diff --git a/crypto/algapi.c b/crypto/algapi.c
+index f137a43..ec286a2 100644
+--- a/crypto/algapi.c
++++ b/crypto/algapi.c
+@@ -98,6 +98,9 @@ static void crypto_remove_spawn(struct crypto_spawn *spawn,
+ return;
+
+ inst->alg.cra_flags |= CRYPTO_ALG_DEAD;
++ if (hlist_unhashed(&inst->list))
++ return;
++
+ if (!tmpl || !crypto_tmpl_get(tmpl))
+ return;
+
+@@ -333,9 +336,6 @@ int crypto_register_instance(struct crypto_template *tmpl,
+ LIST_HEAD(list);
+ int err = -EINVAL;
+
+- if (inst->alg.cra_destroy)
+- goto err;
+-
+ err = crypto_check_alg(&inst->alg);
+ if (err)
+ goto err;
+diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
+index 3400b3e..e722f83 100644
+--- a/drivers/ata/ahci.c
++++ b/drivers/ata/ahci.c
+@@ -1241,7 +1241,7 @@ static void ahci_host_intr(struct ata_port *ap)
+ struct ata_eh_info *ehi = &ap->eh_info;
+ struct ahci_port_priv *pp = ap->private_data;
+ u32 status, qc_active;
+- int rc, known_irq = 0;
++ int rc;
+
+ status = readl(port_mmio + PORT_IRQ_STAT);
+ writel(status, port_mmio + PORT_IRQ_STAT);
+@@ -1257,74 +1257,11 @@ static void ahci_host_intr(struct ata_port *ap)
+ qc_active = readl(port_mmio + PORT_CMD_ISSUE);
+
+ rc = ata_qc_complete_multiple(ap, qc_active, NULL);
+- if (rc > 0)
+- return;
+ if (rc < 0) {
+ ehi->err_mask |= AC_ERR_HSM;
+ ehi->action |= ATA_EH_SOFTRESET;
+ ata_port_freeze(ap);
+- return;
+- }
+-
+- /* hmmm... a spurious interupt */
+-
+- /* if !NCQ, ignore. No modern ATA device has broken HSM
+- * implementation for non-NCQ commands.
+- */
+- if (!ap->sactive)
+- return;
+-
+- if (status & PORT_IRQ_D2H_REG_FIS) {
+- if (!pp->ncq_saw_d2h)
+- ata_port_printk(ap, KERN_INFO,
+- "D2H reg with I during NCQ, "
+- "this message won't be printed again\n");
+- pp->ncq_saw_d2h = 1;
+- known_irq = 1;
+- }
+-
+- if (status & PORT_IRQ_DMAS_FIS) {
+- if (!pp->ncq_saw_dmas)
+- ata_port_printk(ap, KERN_INFO,
+- "DMAS FIS during NCQ, "
+- "this message won't be printed again\n");
+- pp->ncq_saw_dmas = 1;
+- known_irq = 1;
+- }
+-
+- if (status & PORT_IRQ_SDB_FIS) {
+- const __le32 *f = pp->rx_fis + RX_FIS_SDB;
+-
+- if (le32_to_cpu(f[1])) {
+- /* SDB FIS containing spurious completions
+- * might be dangerous, whine and fail commands
+- * with HSM violation. EH will turn off NCQ
+- * after several such failures.
+- */
+- ata_ehi_push_desc(ehi,
+- "spurious completions during NCQ "
+- "issue=0x%x SAct=0x%x FIS=%08x:%08x",
+- readl(port_mmio + PORT_CMD_ISSUE),
+- readl(port_mmio + PORT_SCR_ACT),
+- le32_to_cpu(f[0]), le32_to_cpu(f[1]));
+- ehi->err_mask |= AC_ERR_HSM;
+- ehi->action |= ATA_EH_SOFTRESET;
+- ata_port_freeze(ap);
+- } else {
+- if (!pp->ncq_saw_sdb)
+- ata_port_printk(ap, KERN_INFO,
+- "spurious SDB FIS %08x:%08x during NCQ, "
+- "this message won't be printed again\n",
+- le32_to_cpu(f[0]), le32_to_cpu(f[1]));
+- pp->ncq_saw_sdb = 1;
+- }
+- known_irq = 1;
+ }
+-
+- if (!known_irq)
+- ata_port_printk(ap, KERN_INFO, "spurious interrupt "
+- "(irq_stat 0x%x active_tag 0x%x sactive 0x%x)\n",
+- status, ap->active_tag, ap->sactive);
+ }
+
+ static void ahci_irq_clear(struct ata_port *ap)
+diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
+index e6e403f..22b6368 100644
+--- a/drivers/ata/libata-core.c
++++ b/drivers/ata/libata-core.c
+@@ -3785,6 +3785,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
+ /* Devices where NCQ should be avoided */
+ /* NCQ is slow */
+ { "WDC WD740ADFD-00", NULL, ATA_HORKAGE_NONCQ },
++ { "WDC WD740ADFD-00NLR1", NULL, ATA_HORKAGE_NONCQ, },
+ /* http://thread.gmane.org/gmane.linux.ide/14907 */
+ { "FUJITSU MHT2060BH", NULL, ATA_HORKAGE_NONCQ },
+ /* NCQ is broken */
+@@ -3803,15 +3804,6 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
+ { "HTS541060G9SA00", "MB3OC60D", ATA_HORKAGE_NONCQ, },
+ { "HTS541080G9SA00", "MB4OC60D", ATA_HORKAGE_NONCQ, },
+ { "HTS541010G9SA00", "MBZOC60D", ATA_HORKAGE_NONCQ, },
+- /* Drives which do spurious command completion */
+- { "HTS541680J9SA00", "SB2IC7EP", ATA_HORKAGE_NONCQ, },
+- { "HTS541612J9SA00", "SBDIC7JP", ATA_HORKAGE_NONCQ, },
+- { "Hitachi HTS541616J9SA00", "SB4OC70P", ATA_HORKAGE_NONCQ, },
+- { "WDC WD740ADFD-00NLR1", NULL, ATA_HORKAGE_NONCQ, },
+- { "FUJITSU MHV2080BH", "00840028", ATA_HORKAGE_NONCQ, },
+- { "ST9160821AS", "3.CLF", ATA_HORKAGE_NONCQ, },
+- { "ST3160812AS", "3.AD", ATA_HORKAGE_NONCQ, },
+- { "SAMSUNG HD401LJ", "ZZ100-15", ATA_HORKAGE_NONCQ, },
+
+ /* End Marker */
+ { }
+diff --git a/drivers/atm/he.c b/drivers/atm/he.c
+index d33aba6..3b64a99 100644
+--- a/drivers/atm/he.c
++++ b/drivers/atm/he.c
+@@ -394,6 +394,11 @@ he_init_one(struct pci_dev *pci_dev, const struct pci_device_id *pci_ent)
+ he_dev->atm_dev->dev_data = he_dev;
+ atm_dev->dev_data = he_dev;
+ he_dev->number = atm_dev->number;
++#ifdef USE_TASKLET
++ tasklet_init(&he_dev->tasklet, he_tasklet, (unsigned long) he_dev);
++#endif
++ spin_lock_init(&he_dev->global_lock);
++
+ if (he_start(atm_dev)) {
+ he_stop(he_dev);
+ err = -ENODEV;
+@@ -1173,11 +1178,6 @@ he_start(struct atm_dev *dev)
+ if ((err = he_init_irq(he_dev)) != 0)
+ return err;
+
+-#ifdef USE_TASKLET
+- tasklet_init(&he_dev->tasklet, he_tasklet, (unsigned long) he_dev);
+-#endif
+- spin_lock_init(&he_dev->global_lock);
+-
+ /* 4.11 enable pci bus controller state machines */
+ host_cntl |= (OUTFF_ENB | CMDFF_ENB |
+ QUICK_RD_RETRY | QUICK_WR_RETRY | PERR_INT_ENB);
+diff --git a/drivers/block/rd.c b/drivers/block/rd.c
+index a1512da..e30bd9e 100644
+--- a/drivers/block/rd.c
++++ b/drivers/block/rd.c
+@@ -189,6 +189,18 @@ static int ramdisk_set_page_dirty(struct page *page)
+ return 0;
+ }
+
++/*
++ * releasepage is called by pagevec_strip/try_to_release_page if
++ * buffers_heads_over_limit is true. Without a releasepage function
++ * try_to_free_buffers is called instead. That can unset the dirty
++ * bit of our ram disk pages, which will be eventually freed, even
++ * if the page is still in use.
++ */
++static int ramdisk_releasepage(struct page *page, gfp_t dummy)
++{
++ return 0;
++}
++
+ static const struct address_space_operations ramdisk_aops = {
+ .readpage = ramdisk_readpage,
+ .prepare_write = ramdisk_prepare_write,
+@@ -196,6 +208,7 @@ static const struct address_space_operations ramdisk_aops = {
+ .writepage = ramdisk_writepage,
+ .set_page_dirty = ramdisk_set_page_dirty,
+ .writepages = ramdisk_writepages,
++ .releasepage = ramdisk_releasepage,
+ };
+
+ static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector,
+diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
+index c97330b..eb9a247 100644
+--- a/drivers/isdn/i4l/isdn_common.c
++++ b/drivers/isdn/i4l/isdn_common.c
+@@ -1514,6 +1514,7 @@ isdn_ioctl(struct inode *inode, struct file *file, uint cmd, ulong arg)
+ if (copy_from_user(&iocts, argp,
+ sizeof(isdn_ioctl_struct)))
+ return -EFAULT;
++ iocts.drvid[sizeof(iocts.drvid)-1] = 0;
+ if (strlen(iocts.drvid)) {
+ if ((p = strchr(iocts.drvid, ',')))
+ *p = 0;
+@@ -1598,6 +1599,7 @@ isdn_ioctl(struct inode *inode, struct file *file, uint cmd, ulong arg)
+ if (copy_from_user(&iocts, argp,
+ sizeof(isdn_ioctl_struct)))
+ return -EFAULT;
++ iocts.drvid[sizeof(iocts.drvid)-1] = 0;
+ if (strlen(iocts.drvid)) {
+ drvidx = -1;
+ for (i = 0; i < ISDN_MAX_DRIVERS; i++)
+@@ -1642,7 +1644,7 @@ isdn_ioctl(struct inode *inode, struct file *file, uint cmd, ulong arg)
+ } else {
+ p = (char __user *) iocts.arg;
+ for (i = 0; i < 10; i++) {
+- sprintf(bname, "%s%s",
++ snprintf(bname, sizeof(bname), "%s%s",
+ strlen(dev->drv[drvidx]->msn2eaz[i]) ?
+ dev->drv[drvidx]->msn2eaz[i] : "_",
+ (i < 9) ? "," : "\0");
+@@ -1672,6 +1674,7 @@ isdn_ioctl(struct inode *inode, struct file *file, uint cmd, ulong arg)
+ char *p;
+ if (copy_from_user(&iocts, argp, sizeof(isdn_ioctl_struct)))
+ return -EFAULT;
++ iocts.drvid[sizeof(iocts.drvid)-1] = 0;
+ if (strlen(iocts.drvid)) {
+ if ((p = strchr(iocts.drvid, ',')))
+ *p = 0;
+diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
+index aa83277..75e1423 100644
+--- a/drivers/isdn/i4l/isdn_net.c
++++ b/drivers/isdn/i4l/isdn_net.c
+@@ -2126,7 +2126,7 @@ isdn_net_find_icall(int di, int ch, int idx, setup_parm *setup)
+ u_long flags;
+ isdn_net_dev *p;
+ isdn_net_phone *n;
+- char nr[32];
++ char nr[ISDN_MSNLEN];
+ char *my_eaz;
+
+ /* Search name in netdev-chain */
+@@ -2135,7 +2135,7 @@ isdn_net_find_icall(int di, int ch, int idx, setup_parm *setup)
+ nr[1] = '\0';
+ printk(KERN_INFO "isdn_net: Incoming call without OAD, assuming '0'\n");
+ } else
+- strcpy(nr, setup->phone);
++ strlcpy(nr, setup->phone, ISDN_MSNLEN);
+ si1 = (int) setup->si1;
+ si2 = (int) setup->si2;
+ if (!setup->eazmsn[0]) {
+@@ -2802,7 +2802,7 @@ isdn_net_setcfg(isdn_net_ioctl_cfg * cfg)
+ chidx = -1;
+ }
+ }
+- strcpy(lp->msn, cfg->eaz);
++ strlcpy(lp->msn, cfg->eaz, sizeof(lp->msn));
+ lp->pre_device = drvidx;
+ lp->pre_channel = chidx;
+ lp->onhtime = cfg->onhtime;
+@@ -2951,7 +2951,7 @@ isdn_net_addphone(isdn_net_ioctl_phone * phone)
+ if (p) {
+ if (!(n = kmalloc(sizeof(isdn_net_phone), GFP_KERNEL)))
+ return -ENOMEM;
+- strcpy(n->num, phone->phone);
++ strlcpy(n->num, phone->phone, sizeof(n->num));
+ n->next = p->local->phone[phone->outgoing & 1];
+ p->local->phone[phone->outgoing & 1] = n;
+ return 0;
+diff --git a/drivers/net/atl1/atl1_main.c b/drivers/net/atl1/atl1_main.c
+index 6862c11..1b7a5a8 100644
+--- a/drivers/net/atl1/atl1_main.c
++++ b/drivers/net/atl1/atl1_main.c
+@@ -2097,21 +2097,26 @@ static int __devinit atl1_probe(struct pci_dev *pdev,
+ struct net_device *netdev;
+ struct atl1_adapter *adapter;
+ static int cards_found = 0;
+- bool pci_using_64 = true;
+ int err;
+
+ err = pci_enable_device(pdev);
+ if (err)
+ return err;
+
+- err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
++ /*
++ * The atl1 chip can DMA to 64-bit addresses, but it uses a single
++ * shared register for the high 32 bits, so only a single, aligned,
++ * 4 GB physical address range can be used at a time.
++ *
++ * Supporting 64-bit DMA on this hardware is more trouble than it's
++ * worth. It is far easier to limit to 32-bit DMA than update
++ * various kernel subsystems to support the mechanics required by a
++ * fixed-high-32-bit system.
++ */
++ err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+ if (err) {
+- err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+- if (err) {
+- dev_err(&pdev->dev, "no usable DMA configuration\n");
+- goto err_dma;
+- }
+- pci_using_64 = false;
++ dev_err(&pdev->dev, "no usable DMA configuration\n");
++ goto err_dma;
+ }
+ /* Mark all PCI regions associated with PCI device
+ * pdev as being reserved by owner atl1_driver_name
+@@ -2176,7 +2181,6 @@ static int __devinit atl1_probe(struct pci_dev *pdev,
+
+ netdev->ethtool_ops = &atl1_ethtool_ops;
+ adapter->bd_number = cards_found;
+- adapter->pci_using_64 = pci_using_64;
+
+ /* setup the private structure */
+ err = atl1_sw_init(adapter);
+@@ -2193,9 +2197,6 @@ static int __devinit atl1_probe(struct pci_dev *pdev,
+ */
+ /* netdev->features |= NETIF_F_TSO; */
+
+- if (pci_using_64)
+- netdev->features |= NETIF_F_HIGHDMA;
+-
+ netdev->features |= NETIF_F_LLTX;
+
+ /*
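+
+The atl1 hunk above replaces the usual two-step DMA mask negotiation with a
+hard 32-bit limit, for the reason given in its comment. For reference, the
+removed code followed the standard PCI probe idiom; a minimal sketch of that
+idiom using the era's pci_set_dma_mask()/DMA_*BIT_MASK API (condensed, not
+the driver's exact code):
+
+        int using_64bit = 1;
+
+        /* ask for 64-bit DMA first, fall back to 32-bit on refusal */
+        if (pci_set_dma_mask(pdev, DMA_64BIT_MASK)) {
+                if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) {
+                        dev_err(&pdev->dev, "no usable DMA configuration\n");
+                        return -EIO;
+                }
+                using_64bit = 0;
+        }
+        /* only a successful 64-bit mask justifies NETIF_F_HIGHDMA */
+        if (using_64bit)
+                netdev->features |= NETIF_F_HIGHDMA;
+
+Dropping the 64-bit branch is what lets the patch also delete pci_using_64
+and the NETIF_F_HIGHDMA assignment further down.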
+diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
+index 06f6ec3..36b3a66 100644
+--- a/drivers/net/forcedeth.c
++++ b/drivers/net/forcedeth.c
+@@ -5283,19 +5283,15 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
+ if (readl(base + NvRegTransmitterControl) & NVREG_XMITCTL_SYNC_PHY_INIT) {
+ np->mac_in_use = readl(base + NvRegTransmitterControl) & NVREG_XMITCTL_MGMT_ST;
+ dprintk(KERN_INFO "%s: mgmt unit is running. mac in use %x.\n", pci_name(pci_dev), np->mac_in_use);
+- for (i = 0; i < 5000; i++) {
+- msleep(1);
+- if (nv_mgmt_acquire_sema(dev)) {
+- /* management unit setup the phy already? */
+- if ((readl(base + NvRegTransmitterControl) & NVREG_XMITCTL_SYNC_MASK) ==
+- NVREG_XMITCTL_SYNC_PHY_INIT) {
+- /* phy is inited by mgmt unit */
+- phyinitialized = 1;
+- dprintk(KERN_INFO "%s: Phy already initialized by mgmt unit.\n", pci_name(pci_dev));
+- } else {
+- /* we need to init the phy */
+- }
+- break;
++ if (nv_mgmt_acquire_sema(dev)) {
++ /* management unit setup the phy already? */
++ if ((readl(base + NvRegTransmitterControl) & NVREG_XMITCTL_SYNC_MASK) ==
++ NVREG_XMITCTL_SYNC_PHY_INIT) {
++ /* phy is inited by mgmt unit */
++ phyinitialized = 1;
++ dprintk(KERN_INFO "%s: Phy already initialized by mgmt unit.\n", pci_name(pci_dev));
++ } else {
++ /* we need to init the phy */
+ }
+ }
+ }
+@@ -5553,6 +5549,22 @@ static struct pci_device_id pci_tbl[] = {
+ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_27),
+ .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_HIGH_DMA|DEV_HAS_POWER_CNTRL|DEV_HAS_MSI|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT,
+ },
++ { /* MCP79 Ethernet Controller */
++ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_36),
++ .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT,
++ },
++ { /* MCP79 Ethernet Controller */
++ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_37),
++ .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT,
++ },
++ { /* MCP79 Ethernet Controller */
++ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_38),
++ .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT,
++ },
++ { /* MCP79 Ethernet Controller */
++ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_39),
++ .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT,
++ },
+ {0,},
+ };
+
+diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
+index 5caa8b3..ba78f8e 100644
+--- a/drivers/usb/host/ehci-hcd.c
++++ b/drivers/usb/host/ehci-hcd.c
+@@ -571,12 +571,15 @@ static int ehci_run (struct usb_hcd *hcd)
+ * from the companions to the EHCI controller. If any of the
+ * companions are in the middle of a port reset at the time, it
+ * could cause trouble. Write-locking ehci_cf_port_reset_rwsem
+- * guarantees that no resets are in progress.
++ * guarantees that no resets are in progress. After we set CF,
++ * a short delay lets the hardware catch up; new resets shouldn't
++ * be started before the port switching actions could complete.
+ */
+ down_write(&ehci_cf_port_reset_rwsem);
+ hcd->state = HC_STATE_RUNNING;
+ ehci_writel(ehci, FLAG_CF, &ehci->regs->configured_flag);
+ ehci_readl(ehci, &ehci->regs->command); /* unblock posted writes */
++ msleep(5);
+ up_write(&ehci_cf_port_reset_rwsem);
+
+ temp = HC_VERSION(ehci_readl(ehci, &ehci->caps->hc_capbase));
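+
+Two separate idioms meet in the ehci hunk above. The ehci_readl() after the
+write is the standard flush for posted MMIO writes; the new msleep(5) then
+gives the controller time to finish its port switching before the rwsem is
+released and new resets may start. The general shape (generic reg/val names,
+illustrative only):
+
+        writel(val, reg);       /* may sit in a posted-write buffer */
+        (void)readl(reg);       /* reading the device forces it out */
+        msleep(5);              /* then let the hardware catch up   */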
+diff --git a/drivers/usb/image/microtek.c b/drivers/usb/image/microtek.c
+index 51bd80d..3acfd1a 100644
+--- a/drivers/usb/image/microtek.c
++++ b/drivers/usb/image/microtek.c
+@@ -823,7 +823,7 @@ static int mts_usb_probe(struct usb_interface *intf,
+ goto out_kfree2;
+
+ new_desc->host->hostdata[0] = (unsigned long)new_desc;
+- if (scsi_add_host(new_desc->host, NULL)) {
++ if (scsi_add_host(new_desc->host, &dev->dev)) {
+ err_retval = -EIO;
+ goto out_host_put;
+ }
+diff --git a/drivers/video/fb_ddc.c b/drivers/video/fb_ddc.c
+index f836137..a0df632 100644
+--- a/drivers/video/fb_ddc.c
++++ b/drivers/video/fb_ddc.c
+@@ -56,13 +56,12 @@ unsigned char *fb_ddc_read(struct i2c_adapter *adapter)
+ int i, j;
+
+ algo_data->setscl(algo_data->data, 1);
+- algo_data->setscl(algo_data->data, 0);
+
+ for (i = 0; i < 3; i++) {
+ /* For some old monitors we need the
+ * following process to initialize/stop DDC
+ */
+- algo_data->setsda(algo_data->data, 0);
++ algo_data->setsda(algo_data->data, 1);
+ msleep(13);
+
+ algo_data->setscl(algo_data->data, 1);
+@@ -97,14 +96,15 @@ unsigned char *fb_ddc_read(struct i2c_adapter *adapter)
+ algo_data->setsda(algo_data->data, 1);
+ msleep(15);
+ algo_data->setscl(algo_data->data, 0);
++ algo_data->setsda(algo_data->data, 0);
+ if (edid)
+ break;
+ }
+ /* Release the DDC lines when done or the Apple Cinema HD display
+ * will switch off
+ */
+- algo_data->setsda(algo_data->data, 0);
+- algo_data->setscl(algo_data->data, 0);
++ algo_data->setsda(algo_data->data, 1);
++ algo_data->setscl(algo_data->data, 1);
+
+ return edid;
+ }
+diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
+index 6ca2d24..f83d235 100644
+--- a/fs/nfsd/nfsfh.c
++++ b/fs/nfsd/nfsfh.c
+@@ -565,13 +565,23 @@ enum fsid_source fsid_source(struct svc_fh *fhp)
+ case FSID_DEV:
+ case FSID_ENCODE_DEV:
+ case FSID_MAJOR_MINOR:
+- return FSIDSOURCE_DEV;
++ if (fhp->fh_export->ex_dentry->d_inode->i_sb->s_type->fs_flags
++ & FS_REQUIRES_DEV)
++ return FSIDSOURCE_DEV;
++ break;
+ case FSID_NUM:
+- return FSIDSOURCE_FSID;
+- default:
+ if (fhp->fh_export->ex_flags & NFSEXP_FSID)
+ return FSIDSOURCE_FSID;
+- else
+- return FSIDSOURCE_UUID;
++ break;
++ default:
++ break;
+ }
++ /* either a UUID type filehandle, or the filehandle doesn't
++ * match the export.
++ */
++ if (fhp->fh_export->ex_flags & NFSEXP_FSID)
++ return FSIDSOURCE_FSID;
++ if (fhp->fh_export->ex_uuid)
++ return FSIDSOURCE_UUID;
++ return FSIDSOURCE_DEV;
+ }
+diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
+index c1ffa1b..887c2ce 100644
+--- a/include/linux/pci_ids.h
++++ b/include/linux/pci_ids.h
+@@ -1239,6 +1239,10 @@
+ #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE 0x0560
+ #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE 0x056C
+ #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE 0x0759
++#define PCI_DEVICE_ID_NVIDIA_NVENET_36 0x0AB0
++#define PCI_DEVICE_ID_NVIDIA_NVENET_37 0x0AB1
++#define PCI_DEVICE_ID_NVIDIA_NVENET_38 0x0AB2
++#define PCI_DEVICE_ID_NVIDIA_NVENET_39 0x0AB3
+
+ #define PCI_VENDOR_ID_IMS 0x10e0
+ #define PCI_DEVICE_ID_IMS_TT128 0x9128
+diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
+index 1c4eb41..9c4ad75 100644
+--- a/include/linux/thread_info.h
++++ b/include/linux/thread_info.h
+@@ -7,12 +7,25 @@
+ #ifndef _LINUX_THREAD_INFO_H
+ #define _LINUX_THREAD_INFO_H
+
++#include <linux/types.h>
++
+ /*
+- * System call restart block.
++ * System call restart block.
+ */
+ struct restart_block {
+ long (*fn)(struct restart_block *);
+- unsigned long arg0, arg1, arg2, arg3;
++ union {
++ struct {
++ unsigned long arg0, arg1, arg2, arg3;
++ };
++ /* For futex_wait */
++ struct {
++ u32 *uaddr;
++ u32 val;
++ u32 flags;
++ u64 time;
++ } futex;
++ };
+ };
+
+ extern long do_no_restart_syscall(struct restart_block *parm);
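+
+The union added above gives restartable syscalls typed, named state while
+keeping the legacy arg0..arg3 view for unconverted users. Its first consumer
+is the futex_wait() change later in this same patch; condensed from that
+hunk:
+
+        /* interrupted: record what to redo, then bail out */
+        restart = &current_thread_info()->restart_block;
+        restart->fn = futex_wait_restart;
+        restart->futex.uaddr = (u32 *)uaddr;
+        restart->futex.val = val;
+        restart->futex.time = abs_time->tv64;
+        restart->futex.flags = fshared ? FLAGS_SHARED : 0;
+        return -ERESTART_RESTARTBLOCK;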
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index a99b4f6..c05e018 100644
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -1258,6 +1258,9 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new,
+ struct sock *sk)
+ {
+ __skb_insert(new, skb->prev, skb, &sk->sk_write_queue);
++
++ if (sk->sk_send_head == skb)
++ sk->sk_send_head = new;
+ }
+
+ static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
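+
+The two added lines preserve the write-queue invariant that sk->sk_send_head
+always points at the first not-yet-transmitted skb: if new is linked in front
+of that skb, new becomes the next packet to send. A miniature of the same
+invariant in plain, self-contained C (hypothetical types, not kernel code):
+
+        struct pkt { struct pkt *next; };
+        struct queue { struct pkt *send_head; };
+
+        static void insert_before(struct queue *q, struct pkt *new,
+                                  struct pkt *skb, struct pkt **prev_next)
+        {
+                new->next = skb;
+                *prev_next = new;
+                if (q->send_head == skb)   /* the fix: move the cursor */
+                        q->send_head = new;
+        }
+
+The tcp_output.c hunk later in this patch drops its manual
+tcp_advance_send_head() call accordingly, since the helper now maintains the
+cursor itself.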
+diff --git a/kernel/exit.c b/kernel/exit.c
+index e3adc46..369dae2 100644
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -1339,7 +1339,7 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
+ if (unlikely(!exit_code) || unlikely(p->exit_state))
+ goto bail_ref;
+ return wait_noreap_copyout(p, pid, uid,
+- why, (exit_code << 8) | 0x7f,
++ why, exit_code,
+ infop, ru);
+ }
+
+diff --git a/kernel/futex.c b/kernel/futex.c
+index 9b57f7e..592cf07 100644
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1129,9 +1129,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+
+ /*
+ * In case we must use restart_block to restart a futex_wait,
+- * we encode in the 'arg3' shared capability
++ * we encode in the 'flags' shared capability
+ */
+-#define ARG3_SHARED 1
++#define FLAGS_SHARED 1
+
+ static long futex_wait_restart(struct restart_block *restart);
+ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+@@ -1272,12 +1272,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+ struct restart_block *restart;
+ restart = &current_thread_info()->restart_block;
+ restart->fn = futex_wait_restart;
+- restart->arg0 = (unsigned long)uaddr;
+- restart->arg1 = (unsigned long)val;
+- restart->arg2 = (unsigned long)abs_time;
+- restart->arg3 = 0;
++ restart->futex.uaddr = (u32 *)uaddr;
++ restart->futex.val = val;
++ restart->futex.time = abs_time->tv64;
++ restart->futex.flags = 0;
++
+ if (fshared)
+- restart->arg3 |= ARG3_SHARED;
++ restart->futex.flags |= FLAGS_SHARED;
+ return -ERESTART_RESTARTBLOCK;
+ }
+
+@@ -1293,15 +1294,15 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+
+ static long futex_wait_restart(struct restart_block *restart)
+ {
+- u32 __user *uaddr = (u32 __user *)restart->arg0;
+- u32 val = (u32)restart->arg1;
+- ktime_t *abs_time = (ktime_t *)restart->arg2;
++ u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
+ struct rw_semaphore *fshared = NULL;
++ ktime_t t;
+
++ t.tv64 = restart->futex.time;
+ restart->fn = do_no_restart_syscall;
+- if (restart->arg3 & ARG3_SHARED)
++ if (restart->futex.flags & FLAGS_SHARED)
+ fshared = &current->mm->mmap_sem;
+- return (long)futex_wait(uaddr, fshared, val, abs_time);
++ return (long)futex_wait(uaddr, fshared, restart->futex.val, &t);
+ }
+
+
+diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
+index 23c03f4..355e867 100644
+--- a/kernel/hrtimer.c
++++ b/kernel/hrtimer.c
+@@ -825,6 +825,14 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+ #ifdef CONFIG_TIME_LOW_RES
+ tim = ktime_add(tim, base->resolution);
+ #endif
++ /*
++ * Careful here: User space might have asked for a
++ * very long sleep, so the add above might result in a
++ * negative number, which enqueues the timer in front
++ * of the queue.
++ */
++ if (tim.tv64 < 0)
++ tim.tv64 = KTIME_MAX;
+ }
+ timer->expires = tim;
+
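+
+The guard added above matters because ktime_t arithmetic is plain 64-bit:
+adding the clock resolution to a near-infinite user-supplied timeout wraps
+negative, which would enqueue the timer at the front of the queue and fire it
+immediately. A self-contained illustration (the wrap is done through unsigned
+arithmetic so the example is well-defined C):
+
+        #include <stdint.h>
+        #include <stdio.h>
+
+        #define KTIME_MAX ((int64_t)~(1ULL << 63))
+
+        int main(void)
+        {
+                int64_t tim = KTIME_MAX - 1;  /* "sleep forever" request */
+                int64_t res = 1000000;        /* 1 ms resolution, in ns  */
+
+                /* the ktime_add() above, modelled with a defined wrap */
+                tim = (int64_t)((uint64_t)tim + (uint64_t)res);
+                if (tim < 0)                  /* the hunk's clamp */
+                        tim = KTIME_MAX;
+                printf("expires = %lld\n", (long long)tim);
+                return 0;
+        }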
+diff --git a/kernel/sys.c b/kernel/sys.c
+index afd9b93..28e8364 100644
+--- a/kernel/sys.c
++++ b/kernel/sys.c
+@@ -31,7 +31,6 @@
+ #include <linux/cn_proc.h>
+ #include <linux/getcpu.h>
+ #include <linux/task_io_accounting_ops.h>
+-#include <linux/cpu.h>
+
+ #include <linux/compat.h>
+ #include <linux/syscalls.h>
+@@ -866,7 +865,6 @@ EXPORT_SYMBOL_GPL(kernel_halt);
+ void kernel_power_off(void)
+ {
+ kernel_shutdown_prepare(SYSTEM_POWER_OFF);
+- disable_nonboot_cpus();
+ printk(KERN_EMERG "Power down.\n");
+ machine_power_off();
+ }
+diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
+index 60f4680..1f3a52e 100644
+--- a/lib/libcrc32c.c
++++ b/lib/libcrc32c.c
+@@ -33,7 +33,6 @@
+ #include <linux/crc32c.h>
+ #include <linux/compiler.h>
+ #include <linux/module.h>
+-#include <asm/byteorder.h>
+
+ MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>");
+ MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations");
+@@ -161,15 +160,13 @@ static const u32 crc32c_table[256] = {
+ */
+
+ u32 __attribute_pure__
+-crc32c_le(u32 seed, unsigned char const *data, size_t length)
++crc32c_le(u32 crc, unsigned char const *data, size_t length)
+ {
+- u32 crc = __cpu_to_le32(seed);
+-
+ while (length--)
+ crc =
+ crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8);
+
+- return __le32_to_cpu(crc);
++ return crc;
+ }
+
+ #endif /* CRC_LE_BITS == 8 */
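+
+The libcrc32c hunk removes a byte-order round-trip that never belonged there:
+the table loop operates on the running CRC in native CPU order, so the
+conversions were a no-op on little-endian and corrupted seeded (incremental)
+use on big-endian. The core update, lifted into compilable form (stdint
+types; the table is the 256-entry one from this file):
+
+        #include <stddef.h>
+        #include <stdint.h>
+
+        static uint32_t crc32c_update(uint32_t crc, const uint8_t *data,
+                                      size_t length,
+                                      const uint32_t table[256])
+        {
+                /* fold one input byte into the low bits, shift, and
+                 * XOR in the table entry; no byte swapping anywhere */
+                while (length--)
+                        crc = table[(crc ^ *data++) & 0xff] ^ (crc >> 8);
+                return crc;
+        }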
+diff --git a/lib/textsearch.c b/lib/textsearch.c
+index 88c98a2..be8bda3 100644
+--- a/lib/textsearch.c
++++ b/lib/textsearch.c
+@@ -7,7 +7,7 @@
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+- * Pablo Neira Ayuso <pablo@eurodev.net>
++ * Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * ==========================================================================
+ *
+@@ -250,7 +250,8 @@ unsigned int textsearch_find_continuous(struct ts_config *conf,
+ * the various search algorithms.
+ *
+ * Returns a new textsearch configuration according to the specified
+- * parameters or a ERR_PTR().
++ * parameters or an ERR_PTR(). If a zero-length pattern is passed, this
++ * function returns ERR_PTR(-EINVAL).
+ */
+ struct ts_config *textsearch_prepare(const char *algo, const void *pattern,
+ unsigned int len, gfp_t gfp_mask, int flags)
+@@ -259,6 +260,9 @@ struct ts_config *textsearch_prepare(const char *algo, const void *pattern,
+ struct ts_config *conf;
+ struct ts_ops *ops;
+
++ if (len == 0)
++ return ERR_PTR(-EINVAL);
++
+ ops = lookup_ts_algo(algo);
+ #ifdef CONFIG_KMOD
+ /*
+diff --git a/mm/shmem.c b/mm/shmem.c
+index 2320b60..d1c65fb 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -1066,7 +1066,7 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+ pvma.vm_pgoff = idx;
+ pvma.vm_end = PAGE_SIZE;
+- page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
++ page = alloc_page_vma(gfp, &pvma, 0);
+ mpol_free(pvma.vm_policy);
+ return page;
+ }
+@@ -1086,7 +1086,7 @@ shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
+ static inline struct page *
+ shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
+ {
+- return alloc_page(gfp | __GFP_ZERO);
++ return alloc_page(gfp);
+ }
+ #endif
+
+@@ -1295,6 +1295,7 @@ repeat:
+
+ info->alloced++;
+ spin_unlock(&info->lock);
++ clear_highpage(filepage);
+ flush_dcache_page(filepage);
+ SetPageUptodate(filepage);
+ }
+diff --git a/net/bridge/br.c b/net/bridge/br.c
+index 848b8fa..94ae4d2 100644
+--- a/net/bridge/br.c
++++ b/net/bridge/br.c
+@@ -39,7 +39,7 @@ static int __init br_init(void)
+
+ err = br_fdb_init();
+ if (err)
+- goto err_out1;
++ goto err_out;
+
+ err = br_netfilter_init();
+ if (err)
+@@ -65,6 +65,8 @@ err_out3:
+ err_out2:
+ br_netfilter_fini();
+ err_out1:
++ br_fdb_fini();
++err_out:
+ llc_sap_put(br_stp_sap);
+ return err;
+ }
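+
+The bridge fix above restores the usual LIFO unwind rule for init paths:
+every step that succeeded before the failure gets its matching teardown, in
+reverse order, and the failing step itself gets none. Previously, failures
+after br_fdb_init() had succeeded unwound without ever calling br_fdb_fini().
+Schematically (stub names, not the bridge code):
+
+        err = a_init();
+        if (err)
+                goto out;       /* nothing to undo yet */
+        err = b_init();
+        if (err)
+                goto undo_a;    /* only a_init() succeeded */
+        return 0;
+
+undo_a:
+        a_fini();
+out:
+        return err;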
+diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
+index 420bbb9..fb2c7cc 100644
+--- a/net/bridge/br_input.c
++++ b/net/bridge/br_input.c
+@@ -127,6 +127,7 @@ static inline int is_link_local(const unsigned char *dest)
+ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
+ {
+ const unsigned char *dest = eth_hdr(skb)->h_dest;
++ int (*rhook)(struct sk_buff **pskb);
+
+ if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
+ goto drop;
+@@ -148,9 +149,9 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
+
+ switch (p->state) {
+ case BR_STATE_FORWARDING:
+-
+- if (br_should_route_hook) {
+- if (br_should_route_hook(&skb))
++ rhook = rcu_dereference(br_should_route_hook);
++ if (rhook != NULL) {
++ if (rhook(&skb))
+ return skb;
+ dest = eth_hdr(skb)->h_dest;
+ }
+diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
+index d37ce04..bc17cf5 100644
+--- a/net/bridge/netfilter/ebtable_broute.c
++++ b/net/bridge/netfilter/ebtable_broute.c
+@@ -70,13 +70,13 @@ static int __init ebtable_broute_init(void)
+ if (ret < 0)
+ return ret;
+ /* see br_input.c */
+- br_should_route_hook = ebt_broute;
++ rcu_assign_pointer(br_should_route_hook, ebt_broute);
+ return ret;
+ }
+
+ static void __exit ebtable_broute_fini(void)
+ {
+- br_should_route_hook = NULL;
++ rcu_assign_pointer(br_should_route_hook, NULL);
+ synchronize_net();
+ ebt_unregister_table(&broute_table);
+ }
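+
+Together with the br_input.c hunk above, this converts the
+br_should_route_hook function pointer to the standard RCU
+publish/read/retract protocol, so the ebtables broute module can be unloaded
+while bridged traffic is in flight. The protocol, condensed from the two
+hunks:
+
+        /* module load: publish the hook */
+        rcu_assign_pointer(br_should_route_hook, ebt_broute);
+
+        /* fast path: sample once, then call through the local copy */
+        rhook = rcu_dereference(br_should_route_hook);
+        if (rhook != NULL && rhook(&skb))
+                return skb;
+
+        /* module unload: retract, then wait out in-flight readers */
+        rcu_assign_pointer(br_should_route_hook, NULL);
+        synchronize_net();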
+diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
+index d46e453..b51ee15 100644
+--- a/net/decnet/dn_dev.c
++++ b/net/decnet/dn_dev.c
+@@ -651,16 +651,18 @@ static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+ struct dn_dev *dn_db;
+ struct ifaddrmsg *ifm;
+ struct dn_ifaddr *ifa, **ifap;
+- int err = -EADDRNOTAVAIL;
++ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy);
+ if (err < 0)
+ goto errout;
+
++ err = -ENODEV;
+ ifm = nlmsg_data(nlh);
+ if ((dn_db = dn_dev_by_index(ifm->ifa_index)) == NULL)
+ goto errout;
+
++ err = -EADDRNOTAVAIL;
+ for (ifap = &dn_db->ifa_list; (ifa = *ifap); ifap = &ifa->ifa_next) {
+ if (tb[IFA_LOCAL] &&
+ nla_memcmp(tb[IFA_LOCAL], &ifa->ifa_local, 2))
+diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
+index e00767e..84097ee 100644
+--- a/net/ipv4/arp.c
++++ b/net/ipv4/arp.c
+@@ -110,12 +110,8 @@
+ #include <net/tcp.h>
+ #include <net/sock.h>
+ #include <net/arp.h>
+-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ #include <net/ax25.h>
+-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ #include <net/netrom.h>
+-#endif
+-#endif
+ #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+ #include <net/atmclip.h>
+ struct neigh_table *clip_tbl_hook;
+@@ -729,20 +725,10 @@ static int arp_process(struct sk_buff *skb)
+ htons(dev_type) != arp->ar_hrd)
+ goto out;
+ break;
+-#ifdef CONFIG_NET_ETHERNET
+ case ARPHRD_ETHER:
+-#endif
+-#ifdef CONFIG_TR
+ case ARPHRD_IEEE802_TR:
+-#endif
+-#ifdef CONFIG_FDDI
+ case ARPHRD_FDDI:
+-#endif
+-#ifdef CONFIG_NET_FC
+ case ARPHRD_IEEE802:
+-#endif
+-#if defined(CONFIG_NET_ETHERNET) || defined(CONFIG_TR) || \
+- defined(CONFIG_FDDI) || defined(CONFIG_NET_FC)
+ /*
+ * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
+ * devices, according to RFC 2625) devices will accept ARP
+@@ -757,21 +743,16 @@ static int arp_process(struct sk_buff *skb)
+ arp->ar_pro != htons(ETH_P_IP))
+ goto out;
+ break;
+-#endif
+-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_AX25))
+ goto out;
+ break;
+-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ case ARPHRD_NETROM:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_NETROM))
+ goto out;
+ break;
+-#endif
+-#endif
+ }
+
+ /* Understand only these message types */
+diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
+index ea02f00..3b01a5f 100644
+--- a/net/ipv4/netfilter/nf_nat_core.c
++++ b/net/ipv4/netfilter/nf_nat_core.c
+@@ -633,7 +633,7 @@ static int clean_nat(struct nf_conn *i, void *data)
+
+ if (!nat)
+ return 0;
+- memset(nat, 0, sizeof(nat));
++ memset(nat, 0, sizeof(*nat));
+ i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
+ return 0;
+ }
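+
+The one-character nf_nat fix above is the classic sizeof-of-a-pointer bug:
+sizeof(nat) is the size of the pointer itself (4 or 8 bytes), so only the
+first few bytes of the NAT state were being cleared. A standalone
+demonstration:
+
+        #include <stdio.h>
+        #include <string.h>
+
+        struct nat_state { char data[128]; };   /* stand-in structure */
+
+        int main(void)
+        {
+                struct nat_state s, *nat = &s;
+
+                memset(&s, 0xff, sizeof(s));
+                memset(nat, 0, sizeof(nat));    /* bug: clears 4-8 bytes */
+                printf("sizeof(nat)=%zu sizeof(*nat)=%zu last byte=%#x\n",
+                       sizeof(nat), sizeof(*nat),
+                       (unsigned)(unsigned char)s.data[127]);  /* 0xff */
+                memset(nat, 0, sizeof(*nat));   /* fix: whole structure */
+                return 0;
+        }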
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 29ca63e..4aa2551 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -3150,18 +3150,14 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
+ offset /= sizeof(u32);
+
+ if (length > 0) {
+- u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
+ u32 *dst = (u32 *) buffer;
+
+- /* Copy first cpu. */
+ *start = buffer;
+- memcpy(dst, src, length);
++ memset(dst, 0, length);
+
+- /* Add the other cpus in, one int at a time */
+ for_each_possible_cpu(i) {
+ unsigned int j;
+-
+- src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
++ u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
+
+ for (j = 0; j < length/4; j++)
+ dst[j] += src[j];
+diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
+index 53ef0f4..6ea1306 100644
+--- a/net/ipv4/sysctl_net_ipv4.c
++++ b/net/ipv4/sysctl_net_ipv4.c
+@@ -121,7 +121,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
+
+ tcp_get_default_congestion_control(val);
+ ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen);
+- if (ret == 0 && newval && newlen)
++ if (ret == 1 && newval && newlen)
+ ret = tcp_set_default_congestion_control(val);
+ return ret;
+ }
+diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
+index b2b2256..31dd8c5 100644
+--- a/net/ipv4/tcp_illinois.c
++++ b/net/ipv4/tcp_illinois.c
+@@ -300,7 +300,7 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
+ struct illinois *ca = inet_csk_ca(sk);
+
+ /* Multiplicative decrease */
+- return max((tp->snd_cwnd * ca->beta) >> BETA_SHIFT, 2U);
++ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
+ }
+
+
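+
+The tcp_illinois fix corrects the slow-start threshold to be the reduced
+congestion window rather than the size of the reduction itself, which had
+made every loss event collapse the window toward the 2-segment floor. A
+worked instance (BETA_SHIFT is taken as 6 here to match the Illinois
+scaling; beta is an assumed mid-range value):
+
+        #include <stdio.h>
+
+        #define BETA_SHIFT 6
+
+        int main(void)
+        {
+                unsigned cwnd = 10, beta = 19;  /* beta/64 ~= 0.30 */
+                unsigned cut = (cwnd * beta) >> BETA_SHIFT;     /* 2 */
+                unsigned old_ret = cut > 2 ? cut : 2;           /* wrong */
+                unsigned new_ret = (cwnd - cut) > 2 ? cwnd - cut : 2;
+
+                printf("old=%u fixed=%u\n", old_ret, new_ret);  /* 2 vs 8 */
+                return 0;
+        }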
+diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
+index 53232dd..eee57e6 100644
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -1279,7 +1279,6 @@ static int tcp_mtu_probe(struct sock *sk)
+
+ skb = tcp_send_head(sk);
+ tcp_insert_write_queue_before(nskb, skb, sk);
+- tcp_advance_send_head(sk, skb);
+
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
+index e26b473..6d614c0 100644
+--- a/net/ipv6/addrconf.c
++++ b/net/ipv6/addrconf.c
+@@ -2285,6 +2285,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
+ break;
+ }
+
++ if (!idev && dev->mtu >= IPV6_MIN_MTU)
++ idev = ipv6_add_dev(dev);
++
+ if (idev)
+ idev->if_flags |= IF_READY;
+ } else {
+@@ -2349,12 +2352,18 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
+ break;
+
+ case NETDEV_CHANGEMTU:
+- if ( idev && dev->mtu >= IPV6_MIN_MTU) {
++ if (idev && dev->mtu >= IPV6_MIN_MTU) {
+ rt6_mtu_change(dev, dev->mtu);
+ idev->cnf.mtu6 = dev->mtu;
+ break;
+ }
+
++ if (!idev && dev->mtu >= IPV6_MIN_MTU) {
++ idev = ipv6_add_dev(dev);
++ if (idev)
++ break;
++ }
++
+ /* MTU falled under IPV6_MIN_MTU. Stop IPv6 on this interface. */
+
+ case NETDEV_DOWN:
+diff --git a/net/key/af_key.c b/net/key/af_key.c
+index 0f8304b..ca0db0f 100644
+--- a/net/key/af_key.c
++++ b/net/key/af_key.c
+@@ -1543,7 +1543,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
+
+ out_hdr = (struct sadb_msg *) out_skb->data;
+ out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+- out_hdr->sadb_msg_type = SADB_DUMP;
++ out_hdr->sadb_msg_type = SADB_GET;
+ out_hdr->sadb_msg_satype = pfkey_proto2satype(proto);
+ out_hdr->sadb_msg_errno = 0;
+ out_hdr->sadb_msg_reserved = 0;
+diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
+index 15fe8f6..fe7b3d8 100644
+--- a/net/netfilter/xt_TCPMSS.c
++++ b/net/netfilter/xt_TCPMSS.c
+@@ -178,10 +178,8 @@ xt_tcpmss_target6(struct sk_buff **pskb,
+
+ nexthdr = ipv6h->nexthdr;
+ tcphoff = ipv6_skip_exthdr(*pskb, sizeof(*ipv6h), &nexthdr);
+- if (tcphoff < 0) {
+- WARN_ON(1);
++ if (tcphoff < 0)
+ return NF_DROP;
+- }
+ ret = tcpmss_mangle_packet(pskb, targinfo, tcphoff,
+ sizeof(*ipv6h) + sizeof(struct tcphdr));
+ if (ret < 0)
+diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
+index e662f1d..0d3103c 100644
+--- a/net/rxrpc/Kconfig
++++ b/net/rxrpc/Kconfig
+@@ -5,6 +5,7 @@
+ config AF_RXRPC
+ tristate "RxRPC session sockets"
+ depends on INET && EXPERIMENTAL
++ select CRYPTO
+ select KEYS
+ help
+ Say Y or M here to include support for RxRPC session sockets (just
+diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
+index d70fa30..ae80150 100644
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1608,8 +1608,15 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
+ mutex_lock(&u->readlock);
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+- if (!skb)
++ if (!skb) {
++ unix_state_lock(sk);
++ /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
++ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
++ (sk->sk_shutdown & RCV_SHUTDOWN))
++ err = 0;
++ unix_state_unlock(sk);
+ goto out_unlock;
++ }
+
+ wake_up_interruptible(&u->peer_wait);
+
+diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
+index dfacb9c..7775488 100644
+--- a/net/xfrm/xfrm_state.c
++++ b/net/xfrm/xfrm_state.c
+@@ -371,7 +371,7 @@ int __xfrm_state_delete(struct xfrm_state *x)
+ * The xfrm_state_alloc call gives a reference, and that
+ * is what we are dropping here.
+ */
+- __xfrm_state_put(x);
++ xfrm_state_put(x);
+ err = 0;
+ }
+
diff --git a/tags/2.6.22-2/01015_linux-2.6.22.16.patch b/tags/2.6.22-2/01015_linux-2.6.22.16.patch
new file mode 100644
index 0000000..34ae110
--- /dev/null
+++ b/tags/2.6.22-2/01015_linux-2.6.22.16.patch
@@ -0,0 +1,27 @@
+Subject: Linux 2.6.22.16
+From: Greg Kroah-Hartman <gregkh@suse.de>
+
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+diff --git a/fs/namei.c b/fs/namei.c
+index 5e2d98d..8e209ce 100644
+--- a/fs/namei.c
++++ b/fs/namei.c
+@@ -1543,7 +1543,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
+ if (S_ISLNK(inode->i_mode))
+ return -ELOOP;
+
+- if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
++ if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
+ return -EISDIR;
+
+ error = vfs_permission(nd, acc_mode);
+@@ -1562,7 +1562,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
+ return -EACCES;
+
+ flag &= ~O_TRUNC;
+- } else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
++ } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
+ return -EROFS;
+ /*
+ * An append-only file must be opened in append mode for writing.
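+
+The two changed checks gate write access to directories and to read-only
+filesystems; the fix keys them off the caller's requested access mode
+(MAY_WRITE, which is also what vfs_permission() validates) instead of
+open-flag bits whose encoding does not line up at this point in the path
+walk. The first rule can be probed from userspace (behaviour POSIX
+guarantees in any case):
+
+        #include <fcntl.h>
+        #include <stdio.h>
+
+        int main(void)
+        {
+                /* opening a directory for writing must fail with EISDIR */
+                if (open("/tmp", O_WRONLY) < 0)
+                        perror("open(/tmp, O_WRONLY)");
+                return 0;
+        }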
diff --git a/tags/2.6.22-2/01016_linux-2.6.22.17.patch b/tags/2.6.22-2/01016_linux-2.6.22.17.patch
new file mode 100644
index 0000000..4f735d5
--- /dev/null
+++ b/tags/2.6.22-2/01016_linux-2.6.22.17.patch
@@ -0,0 +1,1360 @@
+Subject: Linux 2.6.22.17
+From: Greg Kroah-Hartman <gregkh@suse.de>
+
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
+index 4f2f453..c84b7cc 100644
+--- a/arch/powerpc/mm/hash_utils_64.c
++++ b/arch/powerpc/mm/hash_utils_64.c
+@@ -795,7 +795,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
+
+ #ifdef CONFIG_PPC_MM_SLICES
+ /* We only prefault standard pages for now */
+- if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize));
++ if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize))
+ return;
+ #endif
+
+diff --git a/arch/sparc64/kernel/chmc.c b/arch/sparc64/kernel/chmc.c
+index 777d345..6d4f02e 100644
+--- a/arch/sparc64/kernel/chmc.c
++++ b/arch/sparc64/kernel/chmc.c
+@@ -1,7 +1,6 @@
+-/* $Id: chmc.c,v 1.4 2002/01/08 16:00:14 davem Exp $
+- * memctrlr.c: Driver for UltraSPARC-III memory controller.
++/* memctrlr.c: Driver for UltraSPARC-III memory controller.
+ *
+- * Copyright (C) 2001 David S. Miller (davem@redhat.com)
++ * Copyright (C) 2001, 2007 David S. Miller (davem@davemloft.net)
+ */
+
+ #include <linux/module.h>
+@@ -16,6 +15,7 @@
+ #include <linux/init.h>
+ #include <asm/spitfire.h>
+ #include <asm/chmctrl.h>
++#include <asm/cpudata.h>
+ #include <asm/oplib.h>
+ #include <asm/prom.h>
+ #include <asm/io.h>
+@@ -242,8 +242,11 @@ int chmc_getunumber(int syndrome_code,
+ */
+ static u64 read_mcreg(struct mctrl_info *mp, unsigned long offset)
+ {
+- unsigned long ret;
+- int this_cpu = get_cpu();
++ unsigned long ret, this_cpu;
++
++ preempt_disable();
++
++ this_cpu = real_hard_smp_processor_id();
+
+ if (mp->portid == this_cpu) {
+ __asm__ __volatile__("ldxa [%1] %2, %0"
+@@ -255,7 +258,8 @@ static u64 read_mcreg(struct mctrl_info *mp, unsigned long offset)
+ : "r" (mp->regs + offset),
+ "i" (ASI_PHYS_BYPASS_EC_E));
+ }
+- put_cpu();
++
++ preempt_enable();
+
+ return ret;
+ }
+diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S
+index 8059531..193791c 100644
+--- a/arch/sparc64/kernel/entry.S
++++ b/arch/sparc64/kernel/entry.S
+@@ -2593,3 +2593,15 @@ sun4v_mmustat_info:
+ retl
+ nop
+ .size sun4v_mmustat_info, .-sun4v_mmustat_info
++
++ .globl sun4v_mmu_demap_all
++ .type sun4v_mmu_demap_all,#function
++sun4v_mmu_demap_all:
++ clr %o0
++ clr %o1
++ mov HV_MMU_ALL, %o2
++ mov HV_FAST_MMU_DEMAP_ALL, %o5
++ ta HV_FAST_TRAP
++ retl
++ nop
++ .size sun4v_mmu_demap_all, .-sun4v_mmu_demap_all
+diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
+index 4dcd7d0..3ddd99c 100644
+--- a/arch/sparc64/kernel/smp.c
++++ b/arch/sparc64/kernel/smp.c
+@@ -403,7 +403,7 @@ static __inline__ void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, c
+ */
+ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
+ {
+- u64 pstate, ver;
++ u64 pstate, ver, busy_mask;
+ int nack_busy_id, is_jbus, need_more;
+
+ if (cpus_empty(mask))
+@@ -435,14 +435,20 @@ retry:
+ "i" (ASI_INTR_W));
+
+ nack_busy_id = 0;
++ busy_mask = 0;
+ {
+ int i;
+
+ for_each_cpu_mask(i, mask) {
+ u64 target = (i << 14) | 0x70;
+
+- if (!is_jbus)
++ if (is_jbus) {
++ busy_mask |= (0x1UL << (i * 2));
++ } else {
+ target |= (nack_busy_id << 24);
++ busy_mask |= (0x1UL <<
++ (nack_busy_id * 2));
++ }
+ __asm__ __volatile__(
+ "stxa %%g0, [%0] %1\n\t"
+ "membar #Sync\n\t"
+@@ -458,15 +464,16 @@ retry:
+
+ /* Now, poll for completion. */
+ {
+- u64 dispatch_stat;
++ u64 dispatch_stat, nack_mask;
+ long stuck;
+
+ stuck = 100000 * nack_busy_id;
++ nack_mask = busy_mask << 1;
+ do {
+ __asm__ __volatile__("ldxa [%%g0] %1, %0"
+ : "=r" (dispatch_stat)
+ : "i" (ASI_INTR_DISPATCH_STAT));
+- if (dispatch_stat == 0UL) {
++ if (!(dispatch_stat & (busy_mask | nack_mask))) {
+ __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
+ : : "r" (pstate));
+ if (unlikely(need_more)) {
+@@ -483,12 +490,12 @@ retry:
+ }
+ if (!--stuck)
+ break;
+- } while (dispatch_stat & 0x5555555555555555UL);
++ } while (dispatch_stat & busy_mask);
+
+ __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
+ : : "r" (pstate));
+
+- if ((dispatch_stat & ~(0x5555555555555555UL)) == 0) {
++ if (dispatch_stat & busy_mask) {
+ /* Busy bits will not clear, continue instead
+ * of freezing up on this cpu.
+ */
+diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
+index 3010227..ed2484d 100644
+--- a/arch/sparc64/mm/init.c
++++ b/arch/sparc64/mm/init.c
+@@ -1135,14 +1135,9 @@ static void __init mark_kpte_bitmap(unsigned long start, unsigned long end)
+ }
+ }
+
+-static void __init kernel_physical_mapping_init(void)
++static void __init init_kpte_bitmap(void)
+ {
+ unsigned long i;
+-#ifdef CONFIG_DEBUG_PAGEALLOC
+- unsigned long mem_alloced = 0UL;
+-#endif
+-
+- read_obp_memory("reg", &pall[0], &pall_ents);
+
+ for (i = 0; i < pall_ents; i++) {
+ unsigned long phys_start, phys_end;
+@@ -1151,14 +1146,24 @@ static void __init kernel_physical_mapping_init(void)
+ phys_end = phys_start + pall[i].reg_size;
+
+ mark_kpte_bitmap(phys_start, phys_end);
++ }
++}
+
++static void __init kernel_physical_mapping_init(void)
++{
+ #ifdef CONFIG_DEBUG_PAGEALLOC
++ unsigned long i, mem_alloced = 0UL;
++
++ for (i = 0; i < pall_ents; i++) {
++ unsigned long phys_start, phys_end;
++
++ phys_start = pall[i].phys_addr;
++ phys_end = phys_start + pall[i].reg_size;
++
+ mem_alloced += kernel_map_range(phys_start, phys_end,
+ PAGE_KERNEL);
+-#endif
+ }
+
+-#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk("Allocated %ld bytes for kernel page tables.\n",
+ mem_alloced);
+
+@@ -1400,6 +1405,10 @@ void __init paging_init(void)
+
+ inherit_prom_mappings();
+
++ read_obp_memory("reg", &pall[0], &pall_ents);
++
++ init_kpte_bitmap();
++
+ /* Ok, we can use our TLB miss and window trap handlers safely. */
+ setup_tba();
+
+@@ -1854,7 +1863,9 @@ void __flush_tlb_all(void)
+ "wrpr %0, %1, %%pstate"
+ : "=r" (pstate)
+ : "i" (PSTATE_IE));
+- if (tlb_type == spitfire) {
++ if (tlb_type == hypervisor) {
++ sun4v_mmu_demap_all();
++ } else if (tlb_type == spitfire) {
+ for (i = 0; i < 64; i++) {
+ /* Spitfire Errata #32 workaround */
+ /* NOTE: Always runs on spitfire, so no
+diff --git a/drivers/acpi/dispatcher/dsobject.c b/drivers/acpi/dispatcher/dsobject.c
+index a474ca2..954ac8c 100644
+--- a/drivers/acpi/dispatcher/dsobject.c
++++ b/drivers/acpi/dispatcher/dsobject.c
+@@ -137,6 +137,71 @@ acpi_ds_build_internal_object(struct acpi_walk_state *walk_state,
+ return_ACPI_STATUS(status);
+ }
+ }
++
++ /* Special object resolution for elements of a package */
++
++ if ((op->common.parent->common.aml_opcode == AML_PACKAGE_OP) ||
++ (op->common.parent->common.aml_opcode ==
++ AML_VAR_PACKAGE_OP)) {
++ /*
++ * Attempt to resolve the node to a value before we insert it into
++ * the package. If this is a reference to a common data type,
++ * resolve it immediately. According to the ACPI spec, package
++ * elements can only be "data objects" or method references.
++ * Attempt to resolve to an Integer, Buffer, String or Package.
++ * If cannot, return the named reference (for things like Devices,
++ * Methods, etc.) Buffer Fields and Fields will resolve to simple
++ * objects (int/buf/str/pkg).
++ *
++ * NOTE: References to things like Devices, Methods, Mutexes, etc.
++ * will remain as named references. This behavior is not described
++ * in the ACPI spec, but it appears to be an oversight.
++ */
++ obj_desc = (union acpi_operand_object *)op->common.node;
++
++ status =
++ acpi_ex_resolve_node_to_value(ACPI_CAST_INDIRECT_PTR
++ (struct
++ acpi_namespace_node,
++ &obj_desc),
++ walk_state);
++ if (ACPI_FAILURE(status)) {
++ return_ACPI_STATUS(status);
++ }
++
++ switch (op->common.node->type) {
++ /*
++ * For these types, we need the actual node, not the subobject.
++ * However, the subobject got an extra reference count above.
++ */
++ case ACPI_TYPE_MUTEX:
++ case ACPI_TYPE_METHOD:
++ case ACPI_TYPE_POWER:
++ case ACPI_TYPE_PROCESSOR:
++ case ACPI_TYPE_EVENT:
++ case ACPI_TYPE_REGION:
++ case ACPI_TYPE_DEVICE:
++ case ACPI_TYPE_THERMAL:
++
++ obj_desc =
++ (union acpi_operand_object *)op->common.
++ node;
++ break;
++
++ default:
++ break;
++ }
++
++ /*
++ * If above resolved to an operand object, we are done. Otherwise,
++ * we have a NS node, we must create the package entry as a named
++ * reference.
++ */
++ if (ACPI_GET_DESCRIPTOR_TYPE(obj_desc) !=
++ ACPI_DESC_TYPE_NAMED) {
++ goto exit;
++ }
++ }
+ }
+
+ /* Create and init a new internal ACPI object */
+@@ -156,6 +221,7 @@ acpi_ds_build_internal_object(struct acpi_walk_state *walk_state,
+ return_ACPI_STATUS(status);
+ }
+
++ exit:
+ *obj_desc_ptr = obj_desc;
+ return_ACPI_STATUS(AE_OK);
+ }
+@@ -356,12 +422,25 @@ acpi_ds_build_internal_package_obj(struct acpi_walk_state *walk_state,
+ arg = arg->common.next;
+ for (i = 0; arg && (i < element_count); i++) {
+ if (arg->common.aml_opcode == AML_INT_RETURN_VALUE_OP) {
+-
+- /* This package element is already built, just get it */
+-
+- obj_desc->package.elements[i] =
+- ACPI_CAST_PTR(union acpi_operand_object,
+- arg->common.node);
++ if (arg->common.node->type == ACPI_TYPE_METHOD) {
++ /*
++ * A method reference "looks" to the parser to be a method
++ * invocation, so we special case it here
++ */
++ arg->common.aml_opcode = AML_INT_NAMEPATH_OP;
++ status =
++ acpi_ds_build_internal_object(walk_state,
++ arg,
++ &obj_desc->
++ package.
++ elements[i]);
++ } else {
++ /* This package element is already built, just get it */
++
++ obj_desc->package.elements[i] =
++ ACPI_CAST_PTR(union acpi_operand_object,
++ arg->common.node);
++ }
+ } else {
+ status = acpi_ds_build_internal_object(walk_state, arg,
+ &obj_desc->
+diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c
+index 14ced85..0c205b0 100644
+--- a/drivers/atm/nicstar.c
++++ b/drivers/atm/nicstar.c
+@@ -625,14 +625,6 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev)
+ if (mac[i] == NULL)
+ nicstar_init_eprom(card->membase);
+
+- if (request_irq(pcidev->irq, &ns_irq_handler, IRQF_DISABLED | IRQF_SHARED, "nicstar", card) != 0)
+- {
+- printk("nicstar%d: can't allocate IRQ %d.\n", i, pcidev->irq);
+- error = 9;
+- ns_init_card_error(card, error);
+- return error;
+- }
+-
+ /* Set the VPI/VCI MSb mask to zero so we can receive OAM cells */
+ writel(0x00000000, card->membase + VPM);
+
+@@ -858,8 +850,6 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev)
+ card->iovpool.count++;
+ }
+
+- card->intcnt = 0;
+-
+ /* Configure NICStAR */
+ if (card->rct_size == 4096)
+ ns_cfg_rctsize = NS_CFG_RCTSIZE_4096_ENTRIES;
+@@ -868,6 +858,15 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev)
+
+ card->efbie = 1;
+
++ card->intcnt = 0;
++ if (request_irq(pcidev->irq, &ns_irq_handler, IRQF_DISABLED | IRQF_SHARED, "nicstar", card) != 0)
++ {
++ printk("nicstar%d: can't allocate IRQ %d.\n", i, pcidev->irq);
++ error = 9;
++ ns_init_card_error(card, error);
++ return error;
++ }
++
+ /* Register device */
+ card->atmdev = atm_dev_register("nicstar", &atm_ops, -1, NULL);
+ if (card->atmdev == NULL)
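+
+The nicstar reorder above is the standard cure for a shared-interrupt race:
+with IRQF_SHARED the handler may run the moment request_irq() returns,
+because another device on the line can already be raising interrupts, so
+everything ns_irq_handler touches (including intcnt) must be initialized
+first. The resulting probe shape (setup_card_state() is a hypothetical
+stand-in for the configuration steps):
+
+        /* 1. fully initialize all state the ISR can touch */
+        setup_card_state(card);
+        card->intcnt = 0;
+
+        /* 2. only then attach the possibly already-firing handler */
+        if (request_irq(pcidev->irq, &ns_irq_handler,
+                        IRQF_DISABLED | IRQF_SHARED, "nicstar", card)) {
+                ns_init_card_error(card, 9);
+                return 9;
+        }
+
+        /* 3. finally register with the ATM layer */
+        card->atmdev = atm_dev_register("nicstar", &atm_ops, -1, NULL);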
+diff --git a/drivers/char/drm/drm_vm.c b/drivers/char/drm/drm_vm.c
+index b5c5b9f..e2d7be9 100644
+--- a/drivers/char/drm/drm_vm.c
++++ b/drivers/char/drm/drm_vm.c
+@@ -520,6 +520,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma)
+ vma->vm_ops = &drm_vm_dma_ops;
+
+ vma->vm_flags |= VM_RESERVED; /* Don't swap */
++ vma->vm_flags |= VM_DONTEXPAND;
+
+ vma->vm_file = filp; /* Needed for drm_vm_open() */
+ drm_vm_open_locked(vma);
+@@ -669,6 +670,7 @@ static int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma)
+ return -EINVAL; /* This should never happen. */
+ }
+ vma->vm_flags |= VM_RESERVED; /* Don't swap */
++ vma->vm_flags |= VM_DONTEXPAND;
+
+ vma->vm_file = filp; /* Needed for drm_vm_open() */
+ drm_vm_open_locked(vma);
+diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
+index 7ac3061..5685b7a 100644
+--- a/drivers/char/mspec.c
++++ b/drivers/char/mspec.c
+@@ -265,7 +265,8 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, int type)
+ vdata->refcnt = ATOMIC_INIT(1);
+ vma->vm_private_data = vdata;
+
+- vma->vm_flags |= (VM_IO | VM_LOCKED | VM_RESERVED | VM_PFNMAP);
++ vma->vm_flags |= (VM_IO | VM_LOCKED | VM_RESERVED | VM_PFNMAP |
++ VM_DONTEXPAND);
+ if (vdata->type == MSPEC_FETCHOP || vdata->type == MSPEC_UNCACHED)
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ vma->vm_ops = &mspec_vm_ops;
+diff --git a/drivers/connector/cn_queue.c b/drivers/connector/cn_queue.c
+index 296f510..12ceed5 100644
+--- a/drivers/connector/cn_queue.c
++++ b/drivers/connector/cn_queue.c
+@@ -99,8 +99,8 @@ int cn_queue_add_callback(struct cn_queue_dev *dev, char *name, struct cb_id *id
+ spin_unlock_bh(&dev->queue_lock);
+
+ if (found) {
+- atomic_dec(&dev->refcnt);
+ cn_queue_free_callback(cbq);
++ atomic_dec(&dev->refcnt);
+ return -EINVAL;
+ }
+
+diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
+index 59b9943..ad55baa 100644
+--- a/drivers/net/cassini.c
++++ b/drivers/net/cassini.c
+@@ -336,30 +336,6 @@ static inline void cas_mask_intr(struct cas *cp)
+ cas_disable_irq(cp, i);
+ }
+
+-static inline void cas_buffer_init(cas_page_t *cp)
+-{
+- struct page *page = cp->buffer;
+- atomic_set((atomic_t *)&page->lru.next, 1);
+-}
+-
+-static inline int cas_buffer_count(cas_page_t *cp)
+-{
+- struct page *page = cp->buffer;
+- return atomic_read((atomic_t *)&page->lru.next);
+-}
+-
+-static inline void cas_buffer_inc(cas_page_t *cp)
+-{
+- struct page *page = cp->buffer;
+- atomic_inc((atomic_t *)&page->lru.next);
+-}
+-
+-static inline void cas_buffer_dec(cas_page_t *cp)
+-{
+- struct page *page = cp->buffer;
+- atomic_dec((atomic_t *)&page->lru.next);
+-}
+-
+ static void cas_enable_irq(struct cas *cp, const int ring)
+ {
+ if (ring == 0) { /* all but TX_DONE */
+@@ -497,7 +473,6 @@ static int cas_page_free(struct cas *cp, cas_page_t *page)
+ {
+ pci_unmap_page(cp->pdev, page->dma_addr, cp->page_size,
+ PCI_DMA_FROMDEVICE);
+- cas_buffer_dec(page);
+ __free_pages(page->buffer, cp->page_order);
+ kfree(page);
+ return 0;
+@@ -527,7 +502,6 @@ static cas_page_t *cas_page_alloc(struct cas *cp, const gfp_t flags)
+ page->buffer = alloc_pages(flags, cp->page_order);
+ if (!page->buffer)
+ goto page_err;
+- cas_buffer_init(page);
+ page->dma_addr = pci_map_page(cp->pdev, page->buffer, 0,
+ cp->page_size, PCI_DMA_FROMDEVICE);
+ return page;
+@@ -606,7 +580,7 @@ static void cas_spare_recover(struct cas *cp, const gfp_t flags)
+ list_for_each_safe(elem, tmp, &list) {
+ cas_page_t *page = list_entry(elem, cas_page_t, list);
+
+- if (cas_buffer_count(page) > 1)
++ if (page_count(page->buffer) > 1)
+ continue;
+
+ list_del(elem);
+@@ -1374,7 +1348,7 @@ static inline cas_page_t *cas_page_spare(struct cas *cp, const int index)
+ cas_page_t *page = cp->rx_pages[1][index];
+ cas_page_t *new;
+
+- if (cas_buffer_count(page) == 1)
++ if (page_count(page->buffer) == 1)
+ return page;
+
+ new = cas_page_dequeue(cp);
+@@ -1394,7 +1368,7 @@ static cas_page_t *cas_page_swap(struct cas *cp, const int ring,
+ cas_page_t **page1 = cp->rx_pages[1];
+
+ /* swap if buffer is in use */
+- if (cas_buffer_count(page0[index]) > 1) {
++ if (page_count(page0[index]->buffer) > 1) {
+ cas_page_t *new = cas_page_spare(cp, index);
+ if (new) {
+ page1[index] = page0[index];
+@@ -1979,6 +1953,7 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc,
+ struct cas_page *page;
+ struct sk_buff *skb;
+ void *addr, *crcaddr;
++ __sum16 csum;
+ char *p;
+
+ hlen = CAS_VAL(RX_COMP2_HDR_SIZE, words[1]);
+@@ -2062,10 +2037,10 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc,
+
+ skb_shinfo(skb)->nr_frags++;
+ skb->data_len += hlen - swivel;
++ skb->truesize += hlen - swivel;
+ skb->len += hlen - swivel;
+
+ get_page(page->buffer);
+- cas_buffer_inc(page);
+ frag->page = page->buffer;
+ frag->page_offset = off;
+ frag->size = hlen - swivel;
+@@ -2090,7 +2065,6 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc,
+ frag++;
+
+ get_page(page->buffer);
+- cas_buffer_inc(page);
+ frag->page = page->buffer;
+ frag->page_offset = 0;
+ frag->size = hlen;
+@@ -2158,14 +2132,15 @@ end_copy_pkt:
+ skb_put(skb, alloclen);
+ }
+
+- i = CAS_VAL(RX_COMP4_TCP_CSUM, words[3]);
++ csum = (__force __sum16)htons(CAS_VAL(RX_COMP4_TCP_CSUM, words[3]));
+ if (cp->crc_size) {
+ /* checksum includes FCS. strip it out. */
+- i = csum_fold(csum_partial(crcaddr, cp->crc_size, i));
++ csum = csum_fold(csum_partial(crcaddr, cp->crc_size,
++ csum_unfold(csum)));
+ if (addr)
+ cas_page_unmap(addr);
+ }
+- skb->csum = ntohs(i ^ 0xffff);
++ skb->csum = csum_unfold(~csum);
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->protocol = eth_type_trans(skb, cp->dev);
+ return len;
+@@ -2253,7 +2228,7 @@ static int cas_post_rxds_ringN(struct cas *cp, int ring, int num)
+ released = 0;
+ while (entry != last) {
+ /* make a new buffer if it's still in use */
+- if (cas_buffer_count(page[entry]) > 1) {
++ if (page_count(page[entry]->buffer) > 1) {
+ cas_page_t *new = cas_page_dequeue(cp);
+ if (!new) {
+ /* let the timer know that we need to
+diff --git a/drivers/net/cassini.h b/drivers/net/cassini.h
+index a970804..a201431 100644
+--- a/drivers/net/cassini.h
++++ b/drivers/net/cassini.h
+@@ -4122,8 +4122,8 @@ cas_saturn_patch_t cas_saturn_patch[] = {
+ inserted into
+ outgoing frame. */
+ struct cas_tx_desc {
+- u64 control;
+- u64 buffer;
++ __le64 control;
++ __le64 buffer;
+ };
+
+ /* descriptor ring for free buffers contains page-sized buffers. the index
+@@ -4131,8 +4131,8 @@ struct cas_tx_desc {
+ * the completion ring.
+ */
+ struct cas_rx_desc {
+- u64 index;
+- u64 buffer;
++ __le64 index;
++ __le64 buffer;
+ };
+
+ /* received packets are put on the completion ring. */
+@@ -4210,10 +4210,10 @@ struct cas_rx_desc {
+ #define RX_INDEX_RELEASE 0x0000000000002000ULL
+
+ struct cas_rx_comp {
+- u64 word1;
+- u64 word2;
+- u64 word3;
+- u64 word4;
++ __le64 word1;
++ __le64 word2;
++ __le64 word3;
++ __le64 word4;
+ };
+
+ enum link_state {
+@@ -4252,7 +4252,7 @@ struct cas_init_block {
+ struct cas_rx_comp rxcs[N_RX_COMP_RINGS][INIT_BLOCK_RX_COMP];
+ struct cas_rx_desc rxds[N_RX_DESC_RINGS][INIT_BLOCK_RX_DESC];
+ struct cas_tx_desc txds[N_TX_RINGS][INIT_BLOCK_TX];
+- u64 tx_compwb;
++ __le64 tx_compwb;
+ };
+
+ /* tiny buffers to deal with target abort issue. we allocate a bit
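+
+The cassini hunks replace a private reference counter smuggled into
+page->lru.next with the page allocator's own refcount: get_page() when a
+fragment is attached to an skb, and page_count() > 1 to decide whether the
+hardware may still recycle the buffer. In outline, from the hunks above:
+
+        /* hand the page to the stack: take one more reference */
+        get_page(page->buffer);
+        frag->page = page->buffer;
+
+        /* recycle only pages the stack has released */
+        if (page_count(page->buffer) > 1)
+                continue;       /* an skb still holds it */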
+diff --git a/drivers/net/chelsio/cxgb2.c b/drivers/net/chelsio/cxgb2.c
+index 231ce43..a82a1fa 100644
+--- a/drivers/net/chelsio/cxgb2.c
++++ b/drivers/net/chelsio/cxgb2.c
+@@ -370,6 +370,8 @@ static char stats_strings[][ETH_GSTRING_LEN] = {
+ "TxInternalMACXmitError",
+ "TxFramesWithExcessiveDeferral",
+ "TxFCSErrors",
++ "TxJumboFramesOk",
++ "TxJumboOctetsOk",
+
+ "RxOctetsOK",
+ "RxOctetsBad",
+@@ -388,15 +390,16 @@ static char stats_strings[][ETH_GSTRING_LEN] = {
+ "RxInRangeLengthErrors",
+ "RxOutOfRangeLengthField",
+ "RxFrameTooLongErrors",
++ "RxJumboFramesOk",
++ "RxJumboOctetsOk",
+
+ /* Port stats */
+- "RxPackets",
+ "RxCsumGood",
+- "TxPackets",
+ "TxCsumOffload",
+ "TxTso",
+ "RxVlan",
+ "TxVlan",
++ "TxNeedHeadroom",
+
+ /* Interrupt stats */
+ "rx drops",
+@@ -454,23 +457,56 @@ static void get_stats(struct net_device *dev, struct ethtool_stats *stats,
+ const struct cmac_statistics *s;
+ const struct sge_intr_counts *t;
+ struct sge_port_stats ss;
+- unsigned int len;
+
+ s = mac->ops->statistics_update(mac, MAC_STATS_UPDATE_FULL);
+-
+- len = sizeof(u64)*(&s->TxFCSErrors + 1 - &s->TxOctetsOK);
+- memcpy(data, &s->TxOctetsOK, len);
+- data += len;
+-
+- len = sizeof(u64)*(&s->RxFrameTooLongErrors + 1 - &s->RxOctetsOK);
+- memcpy(data, &s->RxOctetsOK, len);
+- data += len;
+-
++ t = t1_sge_get_intr_counts(adapter->sge);
+ t1_sge_get_port_stats(adapter->sge, dev->if_port, &ss);
+- memcpy(data, &ss, sizeof(ss));
+- data += sizeof(ss);
+
+- t = t1_sge_get_intr_counts(adapter->sge);
++ *data++ = s->TxOctetsOK;
++ *data++ = s->TxOctetsBad;
++ *data++ = s->TxUnicastFramesOK;
++ *data++ = s->TxMulticastFramesOK;
++ *data++ = s->TxBroadcastFramesOK;
++ *data++ = s->TxPauseFrames;
++ *data++ = s->TxFramesWithDeferredXmissions;
++ *data++ = s->TxLateCollisions;
++ *data++ = s->TxTotalCollisions;
++ *data++ = s->TxFramesAbortedDueToXSCollisions;
++ *data++ = s->TxUnderrun;
++ *data++ = s->TxLengthErrors;
++ *data++ = s->TxInternalMACXmitError;
++ *data++ = s->TxFramesWithExcessiveDeferral;
++ *data++ = s->TxFCSErrors;
++ *data++ = s->TxJumboFramesOK;
++ *data++ = s->TxJumboOctetsOK;
++
++ *data++ = s->RxOctetsOK;
++ *data++ = s->RxOctetsBad;
++ *data++ = s->RxUnicastFramesOK;
++ *data++ = s->RxMulticastFramesOK;
++ *data++ = s->RxBroadcastFramesOK;
++ *data++ = s->RxPauseFrames;
++ *data++ = s->RxFCSErrors;
++ *data++ = s->RxAlignErrors;
++ *data++ = s->RxSymbolErrors;
++ *data++ = s->RxDataErrors;
++ *data++ = s->RxSequenceErrors;
++ *data++ = s->RxRuntErrors;
++ *data++ = s->RxJabberErrors;
++ *data++ = s->RxInternalMACRcvError;
++ *data++ = s->RxInRangeLengthErrors;
++ *data++ = s->RxOutOfRangeLengthField;
++ *data++ = s->RxFrameTooLongErrors;
++ *data++ = s->RxJumboFramesOK;
++ *data++ = s->RxJumboOctetsOK;
++
++ *data++ = ss.rx_cso_good;
++ *data++ = ss.tx_cso;
++ *data++ = ss.tx_tso;
++ *data++ = ss.vlan_xtract;
++ *data++ = ss.vlan_insert;
++ *data++ = ss.tx_need_hdrroom;
++
+ *data++ = t->rx_drops;
+ *data++ = t->pure_rsps;
+ *data++ = t->unhandled_irqs;
+diff --git a/drivers/net/chelsio/pm3393.c b/drivers/net/chelsio/pm3393.c
+index 678778a..2117c4f 100644
+--- a/drivers/net/chelsio/pm3393.c
++++ b/drivers/net/chelsio/pm3393.c
+@@ -45,7 +45,7 @@
+
+ #include <linux/crc32.h>
+
+-#define OFFSET(REG_ADDR) (REG_ADDR << 2)
++#define OFFSET(REG_ADDR) ((REG_ADDR) << 2)
+
+ /* Max frame size PM3393 can handle. Includes Ethernet header and CRC. */
+ #define MAX_FRAME_SIZE 9600
+@@ -428,69 +428,26 @@ static int pm3393_set_speed_duplex_fc(struct cmac *cmac, int speed, int duplex,
+ return 0;
+ }
+
+-static void pm3393_rmon_update(struct adapter *adapter, u32 offs, u64 *val,
+- int over)
+-{
+- u32 val0, val1, val2;
+-
+- t1_tpi_read(adapter, offs, &val0);
+- t1_tpi_read(adapter, offs + 4, &val1);
+- t1_tpi_read(adapter, offs + 8, &val2);
+-
+- *val &= ~0ull << 40;
+- *val |= val0 & 0xffff;
+- *val |= (val1 & 0xffff) << 16;
+- *val |= (u64)(val2 & 0xff) << 32;
+-
+- if (over)
+- *val += 1ull << 40;
++#define RMON_UPDATE(mac, name, stat_name) \
++{ \
++ t1_tpi_read((mac)->adapter, OFFSET(name), &val0); \
++ t1_tpi_read((mac)->adapter, OFFSET((name)+1), &val1); \
++ t1_tpi_read((mac)->adapter, OFFSET((name)+2), &val2); \
++ (mac)->stats.stat_name = (u64)(val0 & 0xffff) | \
++ ((u64)(val1 & 0xffff) << 16) | \
++ ((u64)(val2 & 0xff) << 32) | \
++ ((mac)->stats.stat_name & \
++ 0xffffff0000000000ULL); \
++ if (ro & \
++ (1ULL << ((name - SUNI1x10GEXP_REG_MSTAT_COUNTER_0_LOW) >> 2))) \
++ (mac)->stats.stat_name += 1ULL << 40; \
+ }
+
+ static const struct cmac_statistics *pm3393_update_statistics(struct cmac *mac,
+ int flag)
+ {
+- static struct {
+- unsigned int reg;
+- unsigned int offset;
+- } hw_stats [] = {
+-
+-#define HW_STAT(name, stat_name) \
+- { name, (&((struct cmac_statistics *)NULL)->stat_name) - (u64 *)NULL }
+-
+- /* Rx stats */
+- HW_STAT(RxOctetsReceivedOK, RxOctetsOK),
+- HW_STAT(RxUnicastFramesReceivedOK, RxUnicastFramesOK),
+- HW_STAT(RxMulticastFramesReceivedOK, RxMulticastFramesOK),
+- HW_STAT(RxBroadcastFramesReceivedOK, RxBroadcastFramesOK),
+- HW_STAT(RxPAUSEMACCtrlFramesReceived, RxPauseFrames),
+- HW_STAT(RxFrameCheckSequenceErrors, RxFCSErrors),
+- HW_STAT(RxFramesLostDueToInternalMACErrors,
+- RxInternalMACRcvError),
+- HW_STAT(RxSymbolErrors, RxSymbolErrors),
+- HW_STAT(RxInRangeLengthErrors, RxInRangeLengthErrors),
+- HW_STAT(RxFramesTooLongErrors , RxFrameTooLongErrors),
+- HW_STAT(RxJabbers, RxJabberErrors),
+- HW_STAT(RxFragments, RxRuntErrors),
+- HW_STAT(RxUndersizedFrames, RxRuntErrors),
+- HW_STAT(RxJumboFramesReceivedOK, RxJumboFramesOK),
+- HW_STAT(RxJumboOctetsReceivedOK, RxJumboOctetsOK),
+-
+- /* Tx stats */
+- HW_STAT(TxOctetsTransmittedOK, TxOctetsOK),
+- HW_STAT(TxFramesLostDueToInternalMACTransmissionError,
+- TxInternalMACXmitError),
+- HW_STAT(TxTransmitSystemError, TxFCSErrors),
+- HW_STAT(TxUnicastFramesTransmittedOK, TxUnicastFramesOK),
+- HW_STAT(TxMulticastFramesTransmittedOK, TxMulticastFramesOK),
+- HW_STAT(TxBroadcastFramesTransmittedOK, TxBroadcastFramesOK),
+- HW_STAT(TxPAUSEMACCtrlFramesTransmitted, TxPauseFrames),
+- HW_STAT(TxJumboFramesReceivedOK, TxJumboFramesOK),
+- HW_STAT(TxJumboOctetsReceivedOK, TxJumboOctetsOK)
+- }, *p = hw_stats;
+- u64 ro;
+- u32 val0, val1, val2, val3;
+- u64 *stats = (u64 *) &mac->stats;
+- unsigned int i;
++ u64 ro;
++ u32 val0, val1, val2, val3;
+
+ /* Snap the counters */
+ pmwrite(mac, SUNI1x10GEXP_REG_MSTAT_CONTROL,
+@@ -504,14 +461,35 @@ static const struct cmac_statistics *pm3393_update_statistics(struct cmac *mac,
+ ro = ((u64)val0 & 0xffff) | (((u64)val1 & 0xffff) << 16) |
+ (((u64)val2 & 0xffff) << 32) | (((u64)val3 & 0xffff) << 48);
+
+- for (i = 0; i < ARRAY_SIZE(hw_stats); i++) {
+- unsigned reg = p->reg - SUNI1x10GEXP_REG_MSTAT_COUNTER_0_LOW;
+-
+- pm3393_rmon_update((mac)->adapter, OFFSET(p->reg),
+- stats + p->offset, ro & (reg >> 2));
+- }
+-
+-
++ /* Rx stats */
++ RMON_UPDATE(mac, RxOctetsReceivedOK, RxOctetsOK);
++ RMON_UPDATE(mac, RxUnicastFramesReceivedOK, RxUnicastFramesOK);
++ RMON_UPDATE(mac, RxMulticastFramesReceivedOK, RxMulticastFramesOK);
++ RMON_UPDATE(mac, RxBroadcastFramesReceivedOK, RxBroadcastFramesOK);
++ RMON_UPDATE(mac, RxPAUSEMACCtrlFramesReceived, RxPauseFrames);
++ RMON_UPDATE(mac, RxFrameCheckSequenceErrors, RxFCSErrors);
++ RMON_UPDATE(mac, RxFramesLostDueToInternalMACErrors,
++ RxInternalMACRcvError);
++ RMON_UPDATE(mac, RxSymbolErrors, RxSymbolErrors);
++ RMON_UPDATE(mac, RxInRangeLengthErrors, RxInRangeLengthErrors);
++ RMON_UPDATE(mac, RxFramesTooLongErrors , RxFrameTooLongErrors);
++ RMON_UPDATE(mac, RxJabbers, RxJabberErrors);
++ RMON_UPDATE(mac, RxFragments, RxRuntErrors);
++ RMON_UPDATE(mac, RxUndersizedFrames, RxRuntErrors);
++ RMON_UPDATE(mac, RxJumboFramesReceivedOK, RxJumboFramesOK);
++ RMON_UPDATE(mac, RxJumboOctetsReceivedOK, RxJumboOctetsOK);
++
++ /* Tx stats */
++ RMON_UPDATE(mac, TxOctetsTransmittedOK, TxOctetsOK);
++ RMON_UPDATE(mac, TxFramesLostDueToInternalMACTransmissionError,
++ TxInternalMACXmitError);
++ RMON_UPDATE(mac, TxTransmitSystemError, TxFCSErrors);
++ RMON_UPDATE(mac, TxUnicastFramesTransmittedOK, TxUnicastFramesOK);
++ RMON_UPDATE(mac, TxMulticastFramesTransmittedOK, TxMulticastFramesOK);
++ RMON_UPDATE(mac, TxBroadcastFramesTransmittedOK, TxBroadcastFramesOK);
++ RMON_UPDATE(mac, TxPAUSEMACCtrlFramesTransmitted, TxPauseFrames);
++ RMON_UPDATE(mac, TxJumboFramesReceivedOK, TxJumboFramesOK);
++ RMON_UPDATE(mac, TxJumboOctetsReceivedOK, TxJumboOctetsOK);
+
+ return &mac->stats;
+ }
+diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c
+index e4f874a..d77f1eb 100644
+--- a/drivers/net/chelsio/sge.c
++++ b/drivers/net/chelsio/sge.c
+@@ -986,11 +986,10 @@ void t1_sge_get_port_stats(const struct sge *sge, int port,
+ for_each_possible_cpu(cpu) {
+ struct sge_port_stats *st = per_cpu_ptr(sge->port_stats[port], cpu);
+
+- ss->rx_packets += st->rx_packets;
+ ss->rx_cso_good += st->rx_cso_good;
+- ss->tx_packets += st->tx_packets;
+ ss->tx_cso += st->tx_cso;
+ ss->tx_tso += st->tx_tso;
++ ss->tx_need_hdrroom += st->tx_need_hdrroom;
+ ss->vlan_xtract += st->vlan_xtract;
+ ss->vlan_insert += st->vlan_insert;
+ }
+@@ -1379,11 +1378,10 @@ static void sge_rx(struct sge *sge, struct freelQ *fl, unsigned int len)
+ }
+ __skb_pull(skb, sizeof(*p));
+
+- skb->dev->last_rx = jiffies;
+ st = per_cpu_ptr(sge->port_stats[p->iff], smp_processor_id());
+- st->rx_packets++;
+
+ skb->protocol = eth_type_trans(skb, adapter->port[p->iff].dev);
++ skb->dev->last_rx = jiffies;
+ if ((adapter->flags & RX_CSUM_ENABLED) && p->csum == 0xffff &&
+ skb->protocol == htons(ETH_P_IP) &&
+ (skb->data[9] == IPPROTO_TCP || skb->data[9] == IPPROTO_UDP)) {
+@@ -1851,7 +1849,8 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+ struct adapter *adapter = dev->priv;
+ struct sge *sge = adapter->sge;
+- struct sge_port_stats *st = per_cpu_ptr(sge->port_stats[dev->if_port], smp_processor_id());
++ struct sge_port_stats *st = per_cpu_ptr(sge->port_stats[dev->if_port],
++ smp_processor_id());
+ struct cpl_tx_pkt *cpl;
+ struct sk_buff *orig_skb = skb;
+ int ret;
+@@ -1859,6 +1858,18 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ if (skb->protocol == htons(ETH_P_CPL5))
+ goto send;
+
++ /*
++ * We are using a non-standard hard_header_len.
++ * Allocate more header room in the rare cases it is not big enough.
++ */
++ if (unlikely(skb_headroom(skb) < dev->hard_header_len - ETH_HLEN)) {
++ skb = skb_realloc_headroom(skb, sizeof(struct cpl_tx_pkt_lso));
++ ++st->tx_need_hdrroom;
++ dev_kfree_skb_any(orig_skb);
++ if (!skb)
++ return NETDEV_TX_OK;
++ }
++
+ if (skb_shinfo(skb)->gso_size) {
+ int eth_type;
+ struct cpl_tx_pkt_lso *hdr;
+@@ -1892,24 +1903,6 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ return NETDEV_TX_OK;
+ }
+
+- /*
+- * We are using a non-standard hard_header_len and some kernel
+- * components, such as pktgen, do not handle it right.
+- * Complain when this happens but try to fix things up.
+- */
+- if (unlikely(skb_headroom(skb) < dev->hard_header_len - ETH_HLEN)) {
+- pr_debug("%s: headroom %d header_len %d\n", dev->name,
+- skb_headroom(skb), dev->hard_header_len);
+-
+- if (net_ratelimit())
+- printk(KERN_ERR "%s: inadequate headroom in "
+- "Tx packet\n", dev->name);
+- skb = skb_realloc_headroom(skb, sizeof(*cpl));
+- dev_kfree_skb_any(orig_skb);
+- if (!skb)
+- return NETDEV_TX_OK;
+- }
+-
+ if (!(adapter->flags & UDP_CSUM_CAPABLE) &&
+ skb->ip_summed == CHECKSUM_PARTIAL &&
+ ip_hdr(skb)->protocol == IPPROTO_UDP) {
+@@ -1955,7 +1948,6 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ cpl->vlan_valid = 0;
+
+ send:
+- st->tx_packets++;
+ dev->trans_start = jiffies;
+ ret = t1_sge_tx(skb, adapter, 0, dev);
+
+diff --git a/drivers/net/chelsio/sge.h b/drivers/net/chelsio/sge.h
+index d132a0e..80165f9 100644
+--- a/drivers/net/chelsio/sge.h
++++ b/drivers/net/chelsio/sge.h
+@@ -57,13 +57,12 @@ struct sge_intr_counts {
+ };
+
+ struct sge_port_stats {
+- u64 rx_packets; /* # of Ethernet packets received */
+ u64 rx_cso_good; /* # of successful RX csum offloads */
+- u64 tx_packets; /* # of TX packets */
+ u64 tx_cso; /* # of TX checksum offloads */
+ u64 tx_tso; /* # of TSO requests */
+ u64 vlan_xtract; /* # of VLAN tag extractions */
+ u64 vlan_insert; /* # of VLAN tag insertions */
++ u64 tx_need_hdrroom; /* # of TX skbs in need of more header room */
+ };
+
+ struct sk_buff;
+diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c
+index 60d2944..4ebb6ea 100644
+--- a/drivers/net/usb/kaweth.c
++++ b/drivers/net/usb/kaweth.c
+@@ -70,7 +70,7 @@
+ #define KAWETH_TX_TIMEOUT (5 * HZ)
+ #define KAWETH_SCRATCH_SIZE 32
+ #define KAWETH_FIRMWARE_BUF_SIZE 4096
+-#define KAWETH_CONTROL_TIMEOUT (30 * HZ)
++#define KAWETH_CONTROL_TIMEOUT (30000)
+
+ #define KAWETH_STATUS_BROKEN 0x0000001
+ #define KAWETH_STATUS_CLOSING 0x0000002
+diff --git a/drivers/net/usb/mcs7830.c b/drivers/net/usb/mcs7830.c
+index 6240b97..3bbc5c4 100644
+--- a/drivers/net/usb/mcs7830.c
++++ b/drivers/net/usb/mcs7830.c
+@@ -94,7 +94,7 @@ static int mcs7830_get_reg(struct usbnet *dev, u16 index, u16 size, void *data)
+
+ ret = usb_control_msg(xdev, usb_rcvctrlpipe(xdev, 0), MCS7830_RD_BREQ,
+ MCS7830_RD_BMREQ, 0x0000, index, data,
+- size, msecs_to_jiffies(MCS7830_CTRL_TIMEOUT));
++ size, MCS7830_CTRL_TIMEOUT);
+ return ret;
+ }
+
+@@ -105,7 +105,7 @@ static int mcs7830_set_reg(struct usbnet *dev, u16 index, u16 size, void *data)
+
+ ret = usb_control_msg(xdev, usb_sndctrlpipe(xdev, 0), MCS7830_WR_BREQ,
+ MCS7830_WR_BMREQ, 0x0000, index, data,
+- size, msecs_to_jiffies(MCS7830_CTRL_TIMEOUT));
++ size, MCS7830_CTRL_TIMEOUT);
+ return ret;
+ }
+
+diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
+index 749e7d8..9f90c10 100644
+--- a/drivers/pci/quirks.c
++++ b/drivers/pci/quirks.c
+@@ -465,6 +465,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, quirk
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_0, quirk_ich6_lpc_acpi );
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_2, quirk_ich6_lpc_acpi );
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_3, quirk_ich6_lpc_acpi );
++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, quirk_ich6_lpc_acpi );
++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, quirk_ich6_lpc_acpi );
++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_2, quirk_ich6_lpc_acpi );
++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_4, quirk_ich6_lpc_acpi );
++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, quirk_ich6_lpc_acpi );
++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_8, quirk_ich6_lpc_acpi );
+
+ /*
+ * VIA ACPI: One IO region pointed to by longword at
+diff --git a/fs/exec.c b/fs/exec.c
+index 3da429d..224e973 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1561,6 +1561,12 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
+ but keep the previous behaviour for now. */
+ if (!ispipe && !S_ISREG(inode->i_mode))
+ goto close_fail;
++ /*
++	 * Don't allow local users to get cute and trick others to coredump
++ * into their pre-created files:
++ */
++ if (inode->i_uid != current->fsuid)
++ goto close_fail;
+ if (!file->f_op)
+ goto close_fail;
+ if (!file->f_op->write)
+diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
+index 70a6911..f87de97 100644
+--- a/fs/ncpfs/mmap.c
++++ b/fs/ncpfs/mmap.c
+@@ -47,9 +47,6 @@ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
+ pos = address - area->vm_start + (area->vm_pgoff << PAGE_SHIFT);
+
+ count = PAGE_SIZE;
+- if (address + PAGE_SIZE > area->vm_end) {
+- count = area->vm_end - address;
+- }
+ /* what we can read in one go */
+ bufsize = NCP_SERVER(inode)->buffer_size;
+
+diff --git a/include/asm-sparc64/hypervisor.h b/include/asm-sparc64/hypervisor.h
+index db2130a..a63a1f6 100644
+--- a/include/asm-sparc64/hypervisor.h
++++ b/include/asm-sparc64/hypervisor.h
+@@ -709,6 +709,10 @@ extern unsigned long sun4v_mmu_tsb_ctx0(unsigned long num_descriptions,
+ */
+ #define HV_FAST_MMU_DEMAP_ALL 0x24
+
++#ifndef __ASSEMBLY__
++extern void sun4v_mmu_demap_all(void);
++#endif
++
+ /* mmu_map_perm_addr()
+ * TRAP: HV_FAST_TRAP
+ * FUNCTION: HV_FAST_MMU_MAP_PERM_ADDR
+diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
+index 887c2ce..c6c9d48 100644
+--- a/include/linux/pci_ids.h
++++ b/include/linux/pci_ids.h
+@@ -2285,6 +2285,8 @@
+ #define PCI_DEVICE_ID_INTEL_ICH9_4 0x2914
+ #define PCI_DEVICE_ID_INTEL_ICH9_5 0x2919
+ #define PCI_DEVICE_ID_INTEL_ICH9_6 0x2930
++#define PCI_DEVICE_ID_INTEL_ICH9_7 0x2916
++#define PCI_DEVICE_ID_INTEL_ICH9_8 0x2918
+ #define PCI_DEVICE_ID_INTEL_82855PM_HB 0x3340
+ #define PCI_DEVICE_ID_INTEL_82830_HB 0x3575
+ #define PCI_DEVICE_ID_INTEL_82830_CGC 0x3577
+diff --git a/kernel/relay.c b/kernel/relay.c
+index 95db8c7..24db7e8 100644
+--- a/kernel/relay.c
++++ b/kernel/relay.c
+@@ -91,6 +91,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+ return -EINVAL;
+
+ vma->vm_ops = &relay_file_mmap_ops;
++ vma->vm_flags |= VM_DONTEXPAND;
+ vma->vm_private_data = buf;
+ buf->chan->cb->buf_mapped(buf, filp);
+
+diff --git a/mm/mmap.c b/mm/mmap.c
+index 906ed40..33fb671 100644
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2157,7 +2157,7 @@ int install_special_mapping(struct mm_struct *mm,
+ vma->vm_start = addr;
+ vma->vm_end = addr + len;
+
+- vma->vm_flags = vm_flags | mm->def_flags;
++ vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+ vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+
+ vma->vm_ops = &special_mapping_vmops;
+diff --git a/net/atm/mpc.c b/net/atm/mpc.c
+index 7c85aa5..181c1c8 100644
+--- a/net/atm/mpc.c
++++ b/net/atm/mpc.c
+@@ -542,6 +542,13 @@ static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev)
+ if (eth->h_proto != htons(ETH_P_IP))
+ goto non_ip; /* Multi-Protocol Over ATM :-) */
+
++ /* Weed out funny packets (e.g., AF_PACKET or raw). */
++ if (skb->len < ETH_HLEN + sizeof(struct iphdr))
++ goto non_ip;
++ skb_set_network_header(skb, ETH_HLEN);
++ if (skb->len < ETH_HLEN + ip_hdr(skb)->ihl * 4 || ip_hdr(skb)->ihl < 5)
++ goto non_ip;
++
+ while (i < mpc->number_of_mps_macs) {
+ if (!compare_ether_addr(eth->h_dest, (mpc->mps_macs + i*ETH_ALEN)))
+ if ( send_via_shortcut(skb, mpc) == 0 ) /* try shortcut */
+diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
+index 0ddaff0..8a9f0ac 100644
+--- a/net/ax25/ax25_in.c
++++ b/net/ax25/ax25_in.c
+@@ -124,7 +124,7 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
+ }
+
+ skb_pull(skb, 1); /* Remove PID */
+- skb_reset_mac_header(skb);
++ skb->mac_header = skb->network_header;
+ skb_reset_network_header(skb);
+ skb->dev = ax25->ax25_dev->dev;
+ skb->pkt_type = PACKET_HOST;
+diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
+index 0dcc245..9607d78 100644
+--- a/net/ipv4/devinet.c
++++ b/net/ipv4/devinet.c
+@@ -1030,7 +1030,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (named++ == 0)
+ continue;
+- dot = strchr(ifa->ifa_label, ':');
++ dot = strchr(old, ':');
+ if (dot == NULL) {
+ sprintf(old, ":%d", named);
+ dot = old;
+diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
+index 6328293..724b612 100644
+--- a/net/ipv4/ip_gre.c
++++ b/net/ipv4/ip_gre.c
+@@ -613,7 +613,7 @@ static int ipgre_rcv(struct sk_buff *skb)
+ offset += 4;
+ }
+
+- skb_reset_mac_header(skb);
++ skb->mac_header = skb->network_header;
+ __pskb_pull(skb, offset);
+ skb_reset_network_header(skb);
+ skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 4aa2551..8f443ed 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -2885,11 +2885,10 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
+ int idx, s_idx;
+
+ s_h = cb->args[0];
++ if (s_h < 0)
++ s_h = 0;
+ s_idx = idx = cb->args[1];
+- for (h = 0; h <= rt_hash_mask; h++) {
+- if (h < s_h) continue;
+- if (h > s_h)
+- s_idx = 0;
++ for (h = s_h; h <= rt_hash_mask; h++) {
+ rcu_read_lock_bh();
+ for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
+ rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
+@@ -2906,6 +2905,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
+ dst_release(xchg(&skb->dst, NULL));
+ }
+ rcu_read_unlock_bh();
++ s_idx = 0;
+ }
+
+ done:
+diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
+index dcd7e32..73708b5 100644
+--- a/net/irda/af_irda.c
++++ b/net/irda/af_irda.c
+@@ -1115,8 +1115,6 @@ static int irda_create(struct socket *sock, int protocol)
+ self->max_sdu_size_rx = TTP_SAR_UNBOUND;
+ break;
+ default:
+- IRDA_ERROR("%s: protocol not supported!\n",
+- __FUNCTION__);
+ return -ESOCKTNOSUPPORT;
+ }
+ break;
+diff --git a/net/key/af_key.c b/net/key/af_key.c
+index ca0db0f..0be3be2 100644
+--- a/net/key/af_key.c
++++ b/net/key/af_key.c
+@@ -2777,12 +2777,22 @@ static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp)
+
+ static inline int aalg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d)
+ {
+- return t->aalgos & (1 << d->desc.sadb_alg_id);
++ unsigned int id = d->desc.sadb_alg_id;
++
++ if (id >= sizeof(t->aalgos) * 8)
++ return 0;
++
++ return (t->aalgos >> id) & 1;
+ }
+
+ static inline int ealg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d)
+ {
+- return t->ealgos & (1 << d->desc.sadb_alg_id);
++ unsigned int id = d->desc.sadb_alg_id;
++
++ if (id >= sizeof(t->ealgos) * 8)
++ return 0;
++
++ return (t->ealgos >> id) & 1;
+ }
+
+ static int count_ah_combs(struct xfrm_tmpl *t)
+diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c
+index c7b5d93..69e77d5 100644
+--- a/net/netrom/nr_dev.c
++++ b/net/netrom/nr_dev.c
+@@ -56,7 +56,7 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev)
+
+ /* Spoof incoming device */
+ skb->dev = dev;
+- skb_reset_mac_header(skb);
++ skb->mac_header = skb->network_header;
+ skb_reset_network_header(skb);
+ skb->pkt_type = PACKET_HOST;
+
+diff --git a/net/x25/x25_forward.c b/net/x25/x25_forward.c
+index 8738ec7..3447803 100644
+--- a/net/x25/x25_forward.c
++++ b/net/x25/x25_forward.c
+@@ -118,13 +118,14 @@ int x25_forward_data(int lci, struct x25_neigh *from, struct sk_buff *skb) {
+ goto out;
+
+ if ( (skbn = pskb_copy(skb, GFP_ATOMIC)) == NULL){
+- goto out;
++ goto output;
+
+ }
+ x25_transmit_link(skbn, nb);
+
+- x25_neigh_put(nb);
+ rc = 1;
++output:
++ x25_neigh_put(nb);
+ out:
+ return rc;
+ }
+diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
+index b48f06f..1c86a23 100644
+--- a/net/xfrm/xfrm_policy.c
++++ b/net/xfrm/xfrm_policy.c
+@@ -1479,8 +1479,9 @@ restart:
+
+ if (sk && sk->sk_policy[1]) {
+ policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
++ err = PTR_ERR(policy);
+ if (IS_ERR(policy))
+- return PTR_ERR(policy);
++ goto dropdst;
+ }
+
+ if (!policy) {
+@@ -1491,8 +1492,9 @@ restart:
+
+ policy = flow_cache_lookup(fl, dst_orig->ops->family,
+ dir, xfrm_policy_lookup);
++ err = PTR_ERR(policy);
+ if (IS_ERR(policy))
+- return PTR_ERR(policy);
++ goto dropdst;
+ }
+
+ if (!policy)
+@@ -1661,8 +1663,9 @@ restart:
+ return 0;
+
+ error:
+- dst_release(dst_orig);
+ xfrm_pols_put(pols, npols);
++dropdst:
++ dst_release(dst_orig);
+ *dst_p = NULL;
+ return err;
+ }
+diff --git a/sound/oss/via82cxxx_audio.c b/sound/oss/via82cxxx_audio.c
+index 5d3c037..f95aa09 100644
+--- a/sound/oss/via82cxxx_audio.c
++++ b/sound/oss/via82cxxx_audio.c
+@@ -2104,6 +2104,7 @@ static struct page * via_mm_nopage (struct vm_area_struct * vma,
+ {
+ struct via_info *card = vma->vm_private_data;
+ struct via_channel *chan = &card->ch_out;
++ unsigned long max_bufs;
+ struct page *dmapage;
+ unsigned long pgoff;
+ int rd, wr;
+@@ -2127,14 +2128,11 @@ static struct page * via_mm_nopage (struct vm_area_struct * vma,
+ rd = card->ch_in.is_mapped;
+ wr = card->ch_out.is_mapped;
+
+-#ifndef VIA_NDEBUG
+- {
+- unsigned long max_bufs = chan->frag_number;
+- if (rd && wr) max_bufs *= 2;
+- /* via_dsp_mmap() should ensure this */
+- assert (pgoff < max_bufs);
+- }
+-#endif
++ max_bufs = chan->frag_number;
++ if (rd && wr)
++ max_bufs *= 2;
++ if (pgoff >= max_bufs)
++ return NOPAGE_SIGBUS;
+
+ /* if full-duplex (read+write) and we have two sets of bufs,
+ * then the playback buffers come first, sez soundcard.c */
+diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c
+index b76b3dd..e617d7e 100644
+--- a/sound/usb/usx2y/usX2Yhwdep.c
++++ b/sound/usb/usx2y/usX2Yhwdep.c
+@@ -88,7 +88,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep * hw, struct file *filp, struct v
+ us428->us428ctls_sharedmem->CtlSnapShotLast = -2;
+ }
+ area->vm_ops = &us428ctls_vm_ops;
+- area->vm_flags |= VM_RESERVED;
++ area->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
+ area->vm_private_data = hw->private_data;
+ return 0;
+ }
+diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c
+index a5e7bcd..6e70520 100644
+--- a/sound/usb/usx2y/usx2yhwdeppcm.c
++++ b/sound/usb/usx2y/usx2yhwdeppcm.c
+@@ -728,7 +728,7 @@ static int snd_usX2Y_hwdep_pcm_mmap(struct snd_hwdep * hw, struct file *filp, st
+ return -ENODEV;
+ }
+ area->vm_ops = &snd_usX2Y_hwdep_pcm_vm_ops;
+- area->vm_flags |= VM_RESERVED;
++ area->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
+ area->vm_private_data = hw->private_data;
+ return 0;
+ }
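
[Annotation, not part of the patch.] A recurring fix in the hunks above (kernel/relay.c, mm/mmap.c, and the two USB audio drivers) is adding VM_DONTEXPAND to driver-owned mappings, so userspace cannot mremap() the VMA beyond the fixed buffer backing it. Below is a minimal sketch of that pattern against the 2.6.22 driver mmap API; the exdrv_* names and buffer size are assumptions, and only the flag handling mirrors the patch.

/*
 * Sketch of the VM_DONTEXPAND pattern from the hunks above. exdrv_* is a
 * made-up driver; its vm_ops would supply a ->nopage handler returning
 * pages of the fixed backing buffer.
 */
#include <linux/fs.h>
#include <linux/mm.h>

#define EXDRV_BUF_SIZE (16 * PAGE_SIZE)         /* fixed backing buffer */

static struct vm_operations_struct exdrv_vm_ops;        /* ->nopage elided */

static int exdrv_mmap(struct file *filp, struct vm_area_struct *vma)
{
        unsigned long len = vma->vm_end - vma->vm_start;

        if (len > EXDRV_BUF_SIZE)               /* validated once, here */
                return -EINVAL;

        /*
         * VM_DONTEXPAND makes mremap() refuse to grow this VMA, so the
         * length check above stays authoritative for the VMA's lifetime;
         * VM_RESERVED keeps the pages away from swapout.
         */
        vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
        vma->vm_ops = &exdrv_vm_ops;
        vma->vm_private_data = filp->private_data;
        return 0;
}

Without the flag, a grown mapping would fault in pages past the validated buffer, which is exactly what the relay and usx2y hunks close off.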
diff --git a/tags/2.6.22-2/01017_linux-2.6.22.18.patch b/tags/2.6.22-2/01017_linux-2.6.22.18.patch
new file mode 100644
index 0000000..4f87816
--- /dev/null
+++ b/tags/2.6.22-2/01017_linux-2.6.22.18.patch
@@ -0,0 +1,14 @@
+diff --git a/fs/splice.c b/fs/splice.c
+index e263d3b..dbbe267 100644
+--- a/fs/splice.c
++++ b/fs/splice.c
+@@ -1182,6 +1182,9 @@ static int get_iovec_page_array(const struct iovec __user *iov,
+ if (unlikely(!base))
+ break;
+
++ if (!access_ok(VERIFY_READ, base, len))
++ break;
++
+ /*
+ * Get this base offset and number of pages, then map
+ * in the user pages.
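
[Annotation, not part of the patch.] For context, the access_ok() guard above sits in the vmsplice() path: get_iovec_page_array() now verifies each iovec base before any user pages are mapped. A hedged userspace sketch of the input class this rejects follows; the kernel address constant is illustrative only (a non-user pointer on 32-bit x86), not a universal value.

/*
 * Userspace sketch: an iovec whose base is not a user address. With the
 * check above, get_iovec_page_array() bails out before mapping anything.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
        int p[2];
        struct iovec iov = {
                .iov_base = (void *)0xc0000000UL,       /* illustrative */
                .iov_len  = 4096,
        };

        if (pipe(p))
                return 1;
        /* A patched kernel refuses the base up front instead of handing
         * it to the page-mapping code. */
        long n = vmsplice(p[1], &iov, 1, 0);
        printf("vmsplice returned %ld (no pages mapped)\n", n);
        return 0;
}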
diff --git a/tags/2.6.22-2/20001_x86-early-quirks-unificiation.patch1 b/tags/2.6.22-2/20001_x86-early-quirks-unificiation.patch1
new file mode 100644
index 0000000..12dd843
--- /dev/null
+++ b/tags/2.6.22-2/20001_x86-early-quirks-unificiation.patch1
@@ -0,0 +1,237 @@
+Subject: x86: Unify i386 and x86-64 early quirks
+
+They were already very similar; just use the same file now.
+
+Cc: lenb@kernel.org
+
+Signed-off-by: Andi Kleen <ak@suse.de>
+
+---
+ arch/i386/kernel/Makefile | 2
+ arch/i386/kernel/acpi/Makefile | 3 -
+ arch/i386/kernel/acpi/earlyquirk.c | 84 -------------------------------------
+ arch/i386/kernel/setup.c | 4 -
+ arch/x86_64/kernel/early-quirks.c | 11 ++++
+ include/asm-i386/acpi.h | 6 --
+ include/asm-i386/dma.h | 2
+ include/asm-x86_64/io_apic.h | 2
+ include/asm-x86_64/proto.h | 2
+ 9 files changed, 18 insertions(+), 98 deletions(-)
+
+--- a/arch/i386/kernel/Makefile 2007-08-27 14:01:19.000000000 -0400
++++ b/arch/i386/kernel/Makefile 2007-08-27 14:02:11.000000000 -0400
+@@ -17,6 +17,7 @@ obj-$(CONFIG_MCA) += mca.o
+ obj-$(CONFIG_X86_MSR) += msr.o
+ obj-$(CONFIG_X86_CPUID) += cpuid.o
+ obj-$(CONFIG_MICROCODE) += microcode.o
++obj-$(CONFIG_PCI) += early-quirks.o
+ obj-$(CONFIG_APM) += apm.o
+ obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o
+ obj-$(CONFIG_SMP) += smpcommon.o
+@@ -84,4 +85,5 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall.
+
+ k8-y += ../../x86_64/kernel/k8.o
+ stacktrace-y += ../../x86_64/kernel/stacktrace.o
++early-quirks-y += ../../x86_64/kernel/early-quirks.o
+
+--- a/arch/i386/kernel/acpi/Makefile 2007-07-08 19:32:17.000000000 -0400
++++ b/arch/i386/kernel/acpi/Makefile 2007-08-27 14:02:11.000000000 -0400
+@@ -1,7 +1,4 @@
+ obj-$(CONFIG_ACPI) += boot.o
+-ifneq ($(CONFIG_PCI),)
+-obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o
+-endif
+ obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
+
+ ifneq ($(CONFIG_ACPI_PROCESSOR),)
+--- a/arch/i386/kernel/acpi/earlyquirk.c 2007-07-08 19:32:17.000000000 -0400
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,84 +0,0 @@
+-/*
+- * Do early PCI probing for bug detection when the main PCI subsystem is
+- * not up yet.
+- */
+-#include <linux/init.h>
+-#include <linux/kernel.h>
+-#include <linux/pci.h>
+-#include <linux/acpi.h>
+-
+-#include <asm/pci-direct.h>
+-#include <asm/acpi.h>
+-#include <asm/apic.h>
+-
+-#ifdef CONFIG_ACPI
+-
+-static int __init nvidia_hpet_check(struct acpi_table_header *header)
+-{
+- return 0;
+-}
+-#endif
+-
+-static int __init check_bridge(int vendor, int device)
+-{
+-#ifdef CONFIG_ACPI
+- static int warned;
+- /* According to Nvidia all timer overrides are bogus unless HPET
+- is enabled. */
+- if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) {
+- if (!warned && acpi_table_parse(ACPI_SIG_HPET,
+- nvidia_hpet_check)) {
+- warned = 1;
+- acpi_skip_timer_override = 1;
+- printk(KERN_INFO "Nvidia board "
+- "detected. Ignoring ACPI "
+- "timer override.\n");
+- printk(KERN_INFO "If you got timer trouble "
+- "try acpi_use_timer_override\n");
+-
+- }
+- }
+-#endif
+- if (vendor == PCI_VENDOR_ID_ATI && timer_over_8254 == 1) {
+- timer_over_8254 = 0;
+- printk(KERN_INFO "ATI board detected. Disabling timer routing "
+- "over 8254.\n");
+- }
+- return 0;
+-}
+-
+-void __init check_acpi_pci(void)
+-{
+- int num, slot, func;
+-
+- /* Assume the machine supports type 1. If not it will
+- always read ffffffff and should not have any side effect.
+- Actually a few buggy systems can machine check. Allow the user
+- to disable it by command line option at least -AK */
+- if (!early_pci_allowed())
+- return;
+-
+- /* Poor man's PCI discovery */
+- for (num = 0; num < 32; num++) {
+- for (slot = 0; slot < 32; slot++) {
+- for (func = 0; func < 8; func++) {
+- u32 class;
+- u32 vendor;
+- class = read_pci_config(num, slot, func,
+- PCI_CLASS_REVISION);
+- if (class == 0xffffffff)
+- break;
+-
+- if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
+- continue;
+-
+- vendor = read_pci_config(num, slot, func,
+- PCI_VENDOR_ID);
+-
+- if (check_bridge(vendor & 0xffff, vendor >> 16))
+- return;
+- }
+-
+- }
+- }
+-}
+--- a/arch/i386/kernel/setup.c 2007-07-08 19:32:17.000000000 -0400
++++ b/arch/i386/kernel/setup.c 2007-08-27 14:01:19.000000000 -0400
+@@ -627,9 +627,7 @@ void __init setup_arch(char **cmdline_p)
+ #endif
+
+ #ifdef CONFIG_PCI
+-#ifdef CONFIG_X86_IO_APIC
+- check_acpi_pci(); /* Checks more than just ACPI actually */
+-#endif
++ early_quirks();
+ #endif
+
+ #ifdef CONFIG_ACPI
+--- a/arch/x86_64/kernel/early-quirks.c 2007-07-08 19:32:17.000000000 -0400
++++ b/arch/x86_64/kernel/early-quirks.c 2007-08-27 14:04:27.000000000 -0400
+@@ -13,9 +13,14 @@
+ #include <linux/acpi.h>
+ #include <linux/pci_ids.h>
+ #include <asm/pci-direct.h>
+-#include <asm/proto.h>
++#include <asm/io_apic.h>
++#include <asm/apic.h>
+ #include <asm/dma.h>
+
++#ifdef CONFIG_X86_64
++#include <asm/proto.h>
++#endif
++
+ static void __init via_bugs(void)
+ {
+ #ifdef CONFIG_IOMMU
+@@ -39,6 +44,7 @@ static int __init nvidia_hpet_check(stru
+ static void __init nvidia_bugs(void)
+ {
+ #ifdef CONFIG_ACPI
++#ifdef CONFIG_X86_IO_APIC
+ /*
+ * All timer overrides on Nvidia are
+ * wrong unless HPET is enabled.
+@@ -58,17 +64,20 @@ static void __init nvidia_bugs(void)
+ "try acpi_use_timer_override\n");
+ }
+ #endif
++#endif
+ /* RED-PEN skip them on mptables too? */
+
+ }
+
+ static void __init ati_bugs(void)
+ {
++#ifdef CONFIG_X86_IO_APIC
+ if (timer_over_8254 == 1) {
+ timer_over_8254 = 0;
+ printk(KERN_INFO
+ "ATI board detected. Disabling timer routing over 8254.\n");
+ }
++#endif
+ }
+
+ struct chipset {
+--- a/include/asm-i386/acpi.h 2007-07-08 19:32:17.000000000 -0400
++++ b/include/asm-i386/acpi.h 2007-08-27 14:02:03.000000000 -0400
+@@ -81,11 +81,7 @@ int __acpi_release_global_lock(unsigned
+ :"=r"(n_hi), "=r"(n_lo) \
+ :"0"(n_hi), "1"(n_lo))
+
+-#ifdef CONFIG_X86_IO_APIC
+-extern void check_acpi_pci(void);
+-#else
+-static inline void check_acpi_pci(void) { }
+-#endif
++extern void early_quirks(void);
+
+ #ifdef CONFIG_ACPI
+ extern int acpi_lapic;
+--- a/include/asm-i386/dma.h 2007-07-08 19:32:17.000000000 -0400
++++ b/include/asm-i386/dma.h 2007-08-27 14:01:19.000000000 -0400
+@@ -294,4 +294,6 @@ extern int isa_dma_bridge_buggy;
+ #define isa_dma_bridge_buggy (0)
+ #endif
+
++#define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT)
++
+ #endif /* _ASM_DMA_H */
+--- a/include/asm-x86_64/io_apic.h 2007-07-08 19:32:17.000000000 -0400
++++ b/include/asm-x86_64/io_apic.h 2007-08-27 14:01:51.000000000 -0400
+@@ -127,4 +127,6 @@ void enable_NMI_through_LVT0 (void * dum
+
+ extern spinlock_t i8259A_lock;
+
++extern int timer_over_8254;
++
+ #endif
+--- a/include/asm-x86_64/proto.h 2007-07-08 19:32:17.000000000 -0400
++++ b/include/asm-x86_64/proto.h 2007-08-27 14:01:19.000000000 -0400
+@@ -106,8 +106,6 @@ extern int fix_aperture;
+ extern int reboot_force;
+ extern int notsc_setup(char *);
+
+-extern int timer_over_8254;
+-
+ extern int gsi_irq_sharing(int gsi);
+
+ extern void smp_local_timer_interrupt(void);
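
[Annotation, not part of the patch.] The file this patch builds on, arch/x86_64/kernel/early-quirks.c, is only partially visible in the hunks: it dispatches the *_bugs() handlers from a vendor table walked with type-1 config reads, much like the deleted check_acpi_pci(). Here is a condensed sketch of that shape; the table is trimmed to one illustrative entry and the loop is paraphrased from context, not quoted.

/* Condensed sketch of the early-quirks dispatch this patch unifies;
 * paraphrased, not a verbatim excerpt. */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <asm/pci-direct.h>

static void __init ati_bugs(void)
{
        /* e.g. clear timer_over_8254, as in the hunk above */
}

static struct chipset {
        u16 vendor;
        void (*f)(void);
} early_qrk[] __initdata = {
        { PCI_VENDOR_ID_ATI, ati_bugs },        /* table trimmed */
        {}
};

void __init early_quirks(void)
{
        int num, slot, func, i;

        if (!early_pci_allowed())
                return;

        /* Poor man's PCI discovery, as in the deleted earlyquirk.c */
        for (num = 0; num < 32; num++)
                for (slot = 0; slot < 32; slot++)
                        for (func = 0; func < 8; func++) {
                                u32 class, vendor;

                                class = read_pci_config(num, slot, func,
                                                        PCI_CLASS_REVISION);
                                if (class == 0xffffffff)
                                        break;
                                if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
                                        continue;
                                vendor = read_pci_config(num, slot, func,
                                                         PCI_VENDOR_ID) & 0xffff;
                                for (i = 0; early_qrk[i].f; i++)
                                        if (early_qrk[i].vendor == vendor)
                                                early_qrk[i].f();
                        }
}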
diff --git a/tags/2.6.22-2/20002_add-console-use-vt.patch1 b/tags/2.6.22-2/20002_add-console-use-vt.patch1
new file mode 100644
index 0000000..a63a027
--- /dev/null
+++ b/tags/2.6.22-2/20002_add-console-use-vt.patch1
@@ -0,0 +1,58 @@
+Subject: add console_use_vt
+From: kraxel@suse.de
+Patch-mainline: no
+
+$subject says all
+
+---
+ drivers/char/tty_io.c | 7 ++++++-
+ include/linux/console.h | 1 +
+ 2 files changed, 7 insertions(+), 1 deletion(-)
+
+--- a/drivers/char/tty_io.c 2007-08-27 14:01:21.000000000 -0400
++++ b/drivers/char/tty_io.c 2007-08-27 14:01:24.000000000 -0400
+@@ -133,6 +133,8 @@ LIST_HEAD(tty_drivers); /* linked list
+ DEFINE_MUTEX(tty_mutex);
+ EXPORT_SYMBOL(tty_mutex);
+
++int console_use_vt = 1;
++
+ #ifdef CONFIG_UNIX98_PTYS
+ extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */
+ extern int pty_limit; /* Config limit on Unix98 ptys */
+@@ -2581,7 +2583,7 @@ retry_open:
+ goto got_driver;
+ }
+ #ifdef CONFIG_VT
+- if (device == MKDEV(TTY_MAJOR,0)) {
++ if (console_use_vt && device == MKDEV(TTY_MAJOR,0)) {
+ extern struct tty_driver *console_driver;
+ driver = console_driver;
+ index = fg_console;
+@@ -4048,6 +4050,8 @@ static int __init tty_init(void)
+ #endif
+
+ #ifdef CONFIG_VT
++ if (!console_use_vt)
++ goto out_vt;
+ cdev_init(&vc0_cdev, &console_fops);
+ if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
+ register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0)
+@@ -4055,6 +4059,7 @@ static int __init tty_init(void)
+ device_create(tty_class, NULL, MKDEV(TTY_MAJOR, 0), "tty0");
+
+ vty_init();
++ out_vt:
+ #endif
+ return 0;
+ }
+--- a/include/linux/console.h 2007-08-27 14:01:24.000000000 -0400
++++ b/include/linux/console.h 2007-08-27 14:01:24.000000000 -0400
+@@ -63,6 +63,7 @@ extern const struct consw dummy_con; /*
+ extern const struct consw vga_con; /* VGA text console */
+ extern const struct consw newport_con; /* SGI Newport console */
+ extern const struct consw prom_con; /* SPARC PROM console */
++extern int console_use_vt;
+
+ int con_is_bound(const struct consw *csw);
+ int register_con_driver(const struct consw *csw, int first, int last);
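
[Annotation, not part of the patch.] The hunks add the console_use_vt gate but not its setter; the Xen arch patches later in this series clear it when no VT is available. As a sketch of how such a flag is typically wired to a boot parameter, assuming nothing about this series: the "novt" name and handler below are hypothetical, not something the patch defines.

/* Hypothetical boot-time toggle for console_use_vt; "novt" is an
 * illustrative name only. */
#include <linux/init.h>

extern int console_use_vt;

static int __init no_vt_console(char *str)
{
        console_use_vt = 0;     /* tty_init() then skips vc0/vty setup */
        return 1;               /* parameter consumed */
}
__setup("novt", no_vt_console);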
diff --git a/tags/2.6.22-2/20003_linux-2.6.19-rc1-kexec-move_segment_code-i386.patch1 b/tags/2.6.22-2/20003_linux-2.6.19-rc1-kexec-move_segment_code-i386.patch1
new file mode 100644
index 0000000..0e266e1
--- /dev/null
+++ b/tags/2.6.22-2/20003_linux-2.6.19-rc1-kexec-move_segment_code-i386.patch1
@@ -0,0 +1,172 @@
+Subject: kexec: Move asm segment handling code to the assembly file (i386)
+From: http://xenbits.xensource.com/xen-unstable.hg (tip 13816)
+Patch-mainline: obsolete
+
+This patch moves the idt, gdt, and segment handling code from machine_kexec.c
+to relocate_kernel.S. The main reason behind this move is to avoid code
+duplication in the Xen hypervisor. With this patch all code required to kexec
+is put on the control page.
+
+On top of that this patch also counts as a cleanup - I think it is much
+nicer to write assembly directly in assembly files than wrap inline assembly
+in C functions for no apparent reason.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+Acked-by: jbeulich@novell.com
+---
+
+ Applies to 2.6.19-rc1.
+
+ arch/i386/kernel/machine_kexec.c | 59 -------------------------------------
+ arch/i386/kernel/relocate_kernel.S | 58 +++++++++++++++++++++++++++++++++---
+ 2 files changed, 53 insertions(+), 64 deletions(-)
+
+--- a/arch/i386/kernel/machine_kexec.c 2007-08-27 12:09:26.000000000 -0400
++++ b/arch/i386/kernel/machine_kexec.c 2007-08-27 14:02:11.000000000 -0400
+@@ -29,48 +29,6 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
+ static u32 kexec_pte0[1024] PAGE_ALIGNED;
+ static u32 kexec_pte1[1024] PAGE_ALIGNED;
+
+-static void set_idt(void *newidt, __u16 limit)
+-{
+- struct Xgt_desc_struct curidt;
+-
+- /* ia32 supports unaliged loads & stores */
+- curidt.size = limit;
+- curidt.address = (unsigned long)newidt;
+-
+- load_idt(&curidt);
+-};
+-
+-
+-static void set_gdt(void *newgdt, __u16 limit)
+-{
+- struct Xgt_desc_struct curgdt;
+-
+- /* ia32 supports unaligned loads & stores */
+- curgdt.size = limit;
+- curgdt.address = (unsigned long)newgdt;
+-
+- load_gdt(&curgdt);
+-};
+-
+-static void load_segments(void)
+-{
+-#define __STR(X) #X
+-#define STR(X) __STR(X)
+-
+- __asm__ __volatile__ (
+- "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+- "\t1:\n"
+- "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+- "\tmovl %%eax,%%ds\n"
+- "\tmovl %%eax,%%es\n"
+- "\tmovl %%eax,%%fs\n"
+- "\tmovl %%eax,%%gs\n"
+- "\tmovl %%eax,%%ss\n"
+- ::: "eax", "memory");
+-#undef STR
+-#undef __STR
+-}
+-
+ /*
+ * A architecture hook called to validate the
+ * proposed image and prepare the control pages
+@@ -127,23 +85,6 @@ NORET_TYPE void machine_kexec(struct kim
+ page_list[PA_PTE_1] = __pa(kexec_pte1);
+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+- /* The segment registers are funny things, they have both a
+- * visible and an invisible part. Whenever the visible part is
+- * set to a specific selector, the invisible part is loaded
+- * with from a table in memory. At no other time is the
+- * descriptor table in memory accessed.
+- *
+- * I take advantage of this here by force loading the
+- * segments, before I zap the gdt with an invalid value.
+- */
+- load_segments();
+- /* The gdt & idt are now invalid.
+- * If you want to load them you must set up your own idt & gdt.
+- */
+- set_gdt(phys_to_virt(0),0);
+- set_idt(phys_to_virt(0),0);
+-
+- /* now call it */
+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+ image->start, cpu_has_pae);
+ }
+--- a/arch/i386/kernel/relocate_kernel.S 2007-08-27 12:09:26.000000000 -0400
++++ b/arch/i386/kernel/relocate_kernel.S 2007-08-27 14:01:24.000000000 -0400
+@@ -154,14 +154,45 @@ relocate_new_kernel:
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, %cr3
+
++ /* setup idt */
++ movl %edi, %eax
++ addl $(idt_48 - relocate_kernel), %eax
++ lidtl (%eax)
++
++ /* setup gdt */
++ movl %edi, %eax
++ addl $(gdt - relocate_kernel), %eax
++ movl %edi, %esi
++ addl $((gdt_48 - relocate_kernel) + 2), %esi
++ movl %eax, (%esi)
++
++ movl %edi, %eax
++ addl $(gdt_48 - relocate_kernel), %eax
++ lgdtl (%eax)
++
++ /* setup data segment registers */
++ mov $(gdt_ds - gdt), %eax
++ mov %eax, %ds
++ mov %eax, %es
++ mov %eax, %fs
++ mov %eax, %gs
++ mov %eax, %ss
++
+ /* setup a new stack at the end of the physical control page */
+ lea 4096(%edi), %esp
+
+- /* jump to identity mapped page */
+- movl %edi, %eax
+- addl $(identity_mapped - relocate_kernel), %eax
+- pushl %eax
+- ret
++ /* load new code segment and jump to identity mapped page */
++ movl %edi, %esi
++ xorl %eax, %eax
++ pushl %eax
++ pushl %esi
++ pushl %eax
++ movl $(gdt_cs - gdt), %eax
++ pushl %eax
++ movl %edi, %eax
++ addl $(identity_mapped - relocate_kernel),%eax
++ pushl %eax
++ iretl
+
+ identity_mapped:
+ /* store the start address on the stack */
+@@ -250,3 +281,20 @@ identity_mapped:
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+ ret
++
++ .align 16
++gdt:
++ .quad 0x0000000000000000 /* NULL descriptor */
++gdt_cs:
++ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
++gdt_ds:
++ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
++gdt_end:
++
++gdt_48:
++ .word gdt_end - gdt - 1 /* limit */
++ .long 0 /* base - filled in by code above */
++
++idt_48:
++ .word 0 /* limit */
++ .long 0 /* base */
diff --git a/tags/2.6.22-2/20004_linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch1 b/tags/2.6.22-2/20004_linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch1
new file mode 100644
index 0000000..d3abb10
--- /dev/null
+++ b/tags/2.6.22-2/20004_linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch1
@@ -0,0 +1,164 @@
+Subject: kexec: Move asm segment handling code to the assembly file (x86_64)
+From: http://xenbits.xensource.com/xen-unstable.hg (tip 13816)
+Patch-mainline: obsolete
+
+This patch moves the idt, gdt, and segment handling code from machine_kexec.c
+to relocate_kernel.S. The main reason behind this move is to avoid code
+duplication in the Xen hypervisor. With this patch all code required to kexec
+is put on the control page.
+
+On top of that this patch also counts as a cleanup - I think it is much
+nicer to write assembly directly in assembly files than wrap inline assembly
+in C functions for no apparent reason.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+Acked-by: jbeulich@novell.com
+---
+
+ Applies to 2.6.19-rc1.
+
+ arch/x86_64/kernel/machine_kexec.c | 58 -----------------------------------
+ arch/x86_64/kernel/relocate_kernel.S | 50 +++++++++++++++++++++++++++---
+ 2 files changed, 45 insertions(+), 63 deletions(-)
+
+--- a/arch/x86_64/kernel/machine_kexec.c 2007-08-27 12:09:26.000000000 -0400
++++ b/arch/x86_64/kernel/machine_kexec.c 2007-08-27 14:02:11.000000000 -0400
+@@ -112,47 +112,6 @@ static int init_pgtable(struct kimage *i
+ return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+ }
+
+-static void set_idt(void *newidt, u16 limit)
+-{
+- struct desc_ptr curidt;
+-
+- /* x86-64 supports unaliged loads & stores */
+- curidt.size = limit;
+- curidt.address = (unsigned long)newidt;
+-
+- __asm__ __volatile__ (
+- "lidtq %0\n"
+- : : "m" (curidt)
+- );
+-};
+-
+-
+-static void set_gdt(void *newgdt, u16 limit)
+-{
+- struct desc_ptr curgdt;
+-
+- /* x86-64 supports unaligned loads & stores */
+- curgdt.size = limit;
+- curgdt.address = (unsigned long)newgdt;
+-
+- __asm__ __volatile__ (
+- "lgdtq %0\n"
+- : : "m" (curgdt)
+- );
+-};
+-
+-static void load_segments(void)
+-{
+- __asm__ __volatile__ (
+- "\tmovl %0,%%ds\n"
+- "\tmovl %0,%%es\n"
+- "\tmovl %0,%%ss\n"
+- "\tmovl %0,%%fs\n"
+- "\tmovl %0,%%gs\n"
+- : : "a" (__KERNEL_DS) : "memory"
+- );
+-}
+-
+ int machine_kexec_prepare(struct kimage *image)
+ {
+ unsigned long start_pgtable;
+@@ -209,23 +168,6 @@ NORET_TYPE void machine_kexec(struct kim
+ page_list[PA_TABLE_PAGE] =
+ (unsigned long)__pa(page_address(image->control_code_page));
+
+- /* The segment registers are funny things, they have both a
+- * visible and an invisible part. Whenever the visible part is
+- * set to a specific selector, the invisible part is loaded
+- * with from a table in memory. At no other time is the
+- * descriptor table in memory accessed.
+- *
+- * I take advantage of this here by force loading the
+- * segments, before I zap the gdt with an invalid value.
+- */
+- load_segments();
+- /* The gdt & idt are now invalid.
+- * If you want to load them you must set up your own idt & gdt.
+- */
+- set_gdt(phys_to_virt(0),0);
+- set_idt(phys_to_virt(0),0);
+-
+- /* now call it */
+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+ image->start);
+ }
+--- a/arch/x86_64/kernel/relocate_kernel.S 2007-08-27 12:09:26.000000000 -0400
++++ b/arch/x86_64/kernel/relocate_kernel.S 2007-08-27 14:01:24.000000000 -0400
+@@ -159,13 +159,39 @@ relocate_new_kernel:
+ movq PTR(PA_PGD)(%rsi), %r9
+ movq %r9, %cr3
+
++ /* setup idt */
++ movq %r8, %rax
++ addq $(idt_80 - relocate_kernel), %rax
++ lidtq (%rax)
++
++ /* setup gdt */
++ movq %r8, %rax
++ addq $(gdt - relocate_kernel), %rax
++ movq %r8, %r9
++ addq $((gdt_80 - relocate_kernel) + 2), %r9
++ movq %rax, (%r9)
++
++ movq %r8, %rax
++ addq $(gdt_80 - relocate_kernel), %rax
++ lgdtq (%rax)
++
++ /* setup data segment registers */
++ xorl %eax, %eax
++ movl %eax, %ds
++ movl %eax, %es
++ movl %eax, %fs
++ movl %eax, %gs
++ movl %eax, %ss
++
+ /* setup a new stack at the end of the physical control page */
+ lea 4096(%r8), %rsp
+
+- /* jump to identity mapped page */
+- addq $(identity_mapped - relocate_kernel), %r8
+- pushq %r8
+- ret
++ /* load new code segment and jump to identity mapped page */
++ movq %r8, %rax
++ addq $(identity_mapped - relocate_kernel), %rax
++ pushq $(gdt_cs - gdt)
++ pushq %rax
++ lretq
+
+ identity_mapped:
+ /* store the start address on the stack */
+@@ -272,5 +298,19 @@ identity_mapped:
+ xorq %r13, %r13
+ xorq %r14, %r14
+ xorq %r15, %r15
+-
+ ret
++
++ .align 16
++gdt:
++ .quad 0x0000000000000000 /* NULL descriptor */
++gdt_cs:
++ .quad 0x00af9a000000ffff
++gdt_end:
++
++gdt_80:
++ .word gdt_end - gdt - 1 /* limit */
++ .quad 0 /* base - filled in by code above */
++
++idt_80:
++ .word 0 /* limit */
++ .quad 0 /* base */
diff --git a/tags/2.6.22-2/20005_blktap-aio-16_03_06.patch1 b/tags/2.6.22-2/20005_blktap-aio-16_03_06.patch1
new file mode 100644
index 0000000..9331e9f
--- /dev/null
+++ b/tags/2.6.22-2/20005_blktap-aio-16_03_06.patch1
@@ -0,0 +1,209 @@
+Subject: AIO/POLL interaction
+From: http://xenbits.xensource.com/xen-3.1-testing.hg (tip 15042)
+Acked-by: jbeulich@novell.com
+
+---
+ fs/aio.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++----
+ include/linux/aio.h | 5 ++
+ 2 files changed, 116 insertions(+), 9 deletions(-)
+
+--- a/fs/aio.c 2007-08-27 12:09:26.000000000 -0400
++++ b/fs/aio.c 2007-08-27 14:01:24.000000000 -0400
+@@ -36,6 +36,11 @@
+ #include <asm/uaccess.h>
+ #include <asm/mmu_context.h>
+
++#ifdef CONFIG_EPOLL
++#include <linux/poll.h>
++#include <linux/anon_inodes.h>
++#endif
++
+ #if DEBUG > 1
+ #define dprintk printk
+ #else
+@@ -1009,6 +1014,11 @@ put_rq:
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+
++#ifdef CONFIG_EPOLL
++ if (ctx->file && waitqueue_active(&ctx->poll_wait))
++ wake_up(&ctx->poll_wait);
++#endif
++
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ return ret;
+ }
+@@ -1016,6 +1026,8 @@ put_rq:
+ /* aio_read_evt
+ * Pull an event off of the ioctx's event ring. Returns the number of
+ * events fetched (0 or 1 ;-)
++ * If ent parameter is 0, just returns the number of events that would
++ * be fetched.
+ * FIXME: make this use cmpxchg.
+ * TODO: make the ringbuffer user mmap()able (requires FIXME).
+ */
+@@ -1038,13 +1050,18 @@ static int aio_read_evt(struct kioctx *i
+
+ head = ring->head % info->nr;
+ if (head != ring->tail) {
+- struct io_event *evp = aio_ring_event(info, head, KM_USER1);
+- *ent = *evp;
+- head = (head + 1) % info->nr;
+- smp_mb(); /* finish reading the event before updatng the head */
+- ring->head = head;
+- ret = 1;
+- put_aio_ring_event(evp, KM_USER1);
++ if (ent) { /* event requested */
++ struct io_event *evp =
++ aio_ring_event(info, head, KM_USER1);
++ *ent = *evp;
++ head = (head + 1) % info->nr;
++ /* finish reading the event before updatng the head */
++			/* finish reading the event before updating the head */
++ ring->head = head;
++ ret = 1;
++ put_aio_ring_event(evp, KM_USER1);
++ } else /* only need to know availability */
++ ret = 1;
+ }
+ spin_unlock(&info->ring_lock);
+
+@@ -1227,9 +1244,78 @@ static void io_destroy(struct kioctx *io
+
+ aio_cancel_all(ioctx);
+ wait_for_all_aios(ioctx);
++#ifdef CONFIG_EPOLL
++ /* forget the poll file, but it's up to the user to close it */
++ if (ioctx->file) {
++ ioctx->file->private_data = 0;
++ ioctx->file = 0;
++ }
++#endif
+ put_ioctx(ioctx); /* once for the lookup */
+ }
+
++#ifdef CONFIG_EPOLL
++
++static int aio_queue_fd_close(struct inode *inode, struct file *file)
++{
++ struct kioctx *ioctx = file->private_data;
++ if (ioctx) {
++ file->private_data = 0;
++ spin_lock_irq(&ioctx->ctx_lock);
++ ioctx->file = 0;
++ spin_unlock_irq(&ioctx->ctx_lock);
++ }
++ return 0;
++}
++
++static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait)
++{ unsigned int pollflags = 0;
++ struct kioctx *ioctx = file->private_data;
++
++ if (ioctx) {
++
++ spin_lock_irq(&ioctx->ctx_lock);
++ /* Insert inside our poll wait queue */
++ poll_wait(file, &ioctx->poll_wait, wait);
++
++ /* Check our condition */
++ if (aio_read_evt(ioctx, 0))
++ pollflags = POLLIN | POLLRDNORM;
++ spin_unlock_irq(&ioctx->ctx_lock);
++ }
++
++ return pollflags;
++}
++
++static const struct file_operations aioq_fops = {
++ .release = aio_queue_fd_close,
++ .poll = aio_queue_fd_poll
++};
++
++/* make_aio_fd:
++ * Create a file descriptor that can be used to poll the event queue.
++ * Based on the excellent epoll code.
++ */
++
++static int make_aio_fd(struct kioctx *ioctx)
++{
++ int error, fd;
++ struct inode *inode;
++ struct file *file;
++
++ error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
++ &aioq_fops, ioctx);
++ if (error)
++ return error;
++
++ /* associate the file with the IO context */
++ ioctx->file = file;
++ init_waitqueue_head(&ioctx->poll_wait);
++ return fd;
++}
++#endif
++
++
+ /* sys_io_setup:
+ * Create an aio_context capable of receiving at least nr_events.
+ * ctxp must not point to an aio_context that already exists, and
+@@ -1242,18 +1328,30 @@ static void io_destroy(struct kioctx *io
+ * resources are available. May fail with -EFAULT if an invalid
+ * pointer is passed for ctxp. Will fail with -ENOSYS if not
+ * implemented.
++ *
++ * To request a selectable fd, the user context has to be initialized
++ * to 1, instead of 0, and the return value is the fd.
++ * This keeps the system call compatible, since a non-zero value
++ * was not allowed so far.
+ */
+ asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
+ {
+ struct kioctx *ioctx = NULL;
+ unsigned long ctx;
+ long ret;
++ int make_fd = 0;
+
+ ret = get_user(ctx, ctxp);
+ if (unlikely(ret))
+ goto out;
+
+ ret = -EINVAL;
++#ifdef CONFIG_EPOLL
++ if (ctx == 1) {
++ make_fd = 1;
++ ctx = 0;
++ }
++#endif
+ if (unlikely(ctx || nr_events == 0)) {
+ pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
+ ctx, nr_events);
+@@ -1264,8 +1362,12 @@ asmlinkage long sys_io_setup(unsigned nr
+ ret = PTR_ERR(ioctx);
+ if (!IS_ERR(ioctx)) {
+ ret = put_user(ioctx->user_id, ctxp);
+- if (!ret)
+- return 0;
++#ifdef CONFIG_EPOLL
++ if (make_fd && ret >= 0)
++ ret = make_aio_fd(ioctx);
++#endif
++ if (ret >= 0)
++ return ret;
+
+ get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
+ io_destroy(ioctx);
+--- a/include/linux/aio.h 2007-08-27 12:09:26.000000000 -0400
++++ b/include/linux/aio.h 2007-08-27 14:01:24.000000000 -0400
+@@ -201,6 +201,11 @@ struct kioctx {
+ struct aio_ring_info ring_info;
+
+ struct delayed_work wq;
++#ifdef CONFIG_EPOLL
++ // poll integration
++ wait_queue_head_t poll_wait;
++ struct file *file;
++#endif
+ };
+
+ /* prototypes */
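
[Annotation, not part of the patch.] The comment block the patch adds to sys_io_setup() defines the userspace contract: seed the context with 1 and the call returns a pollable file descriptor instead of 0, while the real context id is written back through the pointer. A hedged sketch of a caller using raw syscalls; error handling is trimmed and io_submit()/io_getevents() are left out.

/* Userspace sketch of the extended io_setup() convention added above. */
#include <poll.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/aio_abi.h>

int main(void)
{
        aio_context_t ctx = 1;          /* 1 = "give me a selectable fd" */
        long fd = syscall(SYS_io_setup, 128, &ctx);

        if (fd < 0)
                return 1;               /* unpatched kernel: EINVAL */

        /* ctx now holds the real io context id; fd is the descriptor. */
        struct pollfd pfd = { .fd = (int)fd, .events = POLLIN };
        /* After io_submit(), poll() wakes when completions are pending,
         * so io_getevents() can be called without blocking. */
        poll(&pfd, 1, 0);
        return 0;
}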
diff --git a/tags/2.6.22-2/20006_fix-ide-cd-pio-mode.patch1 b/tags/2.6.22-2/20006_fix-ide-cd-pio-mode.patch1
new file mode 100644
index 0000000..1ca415c
--- /dev/null
+++ b/tags/2.6.22-2/20006_fix-ide-cd-pio-mode.patch1
@@ -0,0 +1,36 @@
+Subject: Fix IDE CD-drive PIO mode.
+From: http://xenbits.xensource.com/xen-3.1-testing.hg (tip 15042)
+Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
+
+CD drives in PIO mode don't work under Xen because of a change in Linux
+between 2.6.12 and 2.6.16, as a result of the following thread:
+http://lists.parisc-linux.org/pipermail/parisc-linux/2005-August/027197.html
+
+The change breaks systems which have highmem and a swiotlb because the
+ide-cd driver doesn't use the swiotlb, resulting in read/writes to/from
+highmem pages in PIO mode not working any longer. Xen kernels usually have
+both highmem and a swiotlb.
+
+Acked-by: Jan Beulich <jbeulich@novell.com>
+
+---
+ drivers/ide/ide-lib.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/ide/ide-lib.c 2007-08-27 12:09:26.000000000 -0400
++++ b/drivers/ide/ide-lib.c 2007-08-27 14:02:05.000000000 -0400
+@@ -341,10 +341,10 @@ void ide_toggle_bounce(ide_drive_t *driv
+ {
+ u64 addr = BLK_BOUNCE_HIGH; /* dma64_addr_t */
+
+- if (!PCI_DMA_BUS_IS_PHYS) {
+- addr = BLK_BOUNCE_ANY;
+- } else if (on && drive->media == ide_disk) {
+- if (HWIF(drive)->pci_dev)
++ if (on && drive->media == ide_disk) {
++ if (!PCI_DMA_BUS_IS_PHYS)
++ addr = BLK_BOUNCE_ANY;
++ else if (HWIF(drive)->pci_dev)
+ addr = HWIF(drive)->pci_dev->dma_mask;
+ }
+
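
[Annotation, not part of the patch.] The function being patched feeds addr into blk_queue_bounce_limit() just after the lines shown; that tail lies outside the hunk. A condensed, paraphrased view of the whole decision after this patch, to make the effect on PIO CD drives explicit:

/* Paraphrase of ide_toggle_bounce() after this patch; not a verbatim
 * excerpt. Only DMA-capable disks may relax the bounce limit. */
#include <linux/ide.h>

static void toggle_bounce_sketch(ide_drive_t *drive, int on)
{
        u64 addr = BLK_BOUNCE_HIGH;     /* default: bounce highmem pages */

        if (on && drive->media == ide_disk) {
                if (!PCI_DMA_BUS_IS_PHYS)       /* e.g. IOMMU/swiotlb */
                        addr = BLK_BOUNCE_ANY;
                else if (HWIF(drive)->pci_dev)
                        addr = HWIF(drive)->pci_dev->dma_mask;
        }

        blk_queue_bounce_limit(drive->queue, addr);
}

A CD drive in PIO mode therefore always keeps BLK_BOUNCE_HIGH, so highmem pages are copied through low memory instead of being handed to the PIO code directly, which is what broke under Xen.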
diff --git a/tags/2.6.22-2/20007_i386-mach-io-check-nmi.patch1 b/tags/2.6.22-2/20007_i386-mach-io-check-nmi.patch1
new file mode 100644
index 0000000..c6fccf6
--- /dev/null
+++ b/tags/2.6.22-2/20007_i386-mach-io-check-nmi.patch1
@@ -0,0 +1,53 @@
+Subject: Separate out clearing of NMI I/O check error
+From: http://xenbits.xensource.com/xen-3.1-testing.hg (tip 15042)
+Patch-mainline: obsolete
+Acked-by: jbeulich@novell.com
+
+---
+ arch/i386/kernel/traps.c | 9 +--------
+ include/asm-i386/mach-default/mach_traps.h | 12 ++++++++++++
+ 2 files changed, 13 insertions(+), 8 deletions(-)
+
+--- a/arch/i386/kernel/traps.c 2007-08-27 14:01:24.000000000 -0400
++++ b/arch/i386/kernel/traps.c 2007-08-27 14:01:24.000000000 -0400
+@@ -656,18 +656,11 @@ mem_parity_error(unsigned char reason, s
+ static __kprobes void
+ io_check_error(unsigned char reason, struct pt_regs * regs)
+ {
+- unsigned long i;
+-
+ printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+ show_registers(regs);
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+- reason = (reason & 0xf) | 8;
+- outb(reason, 0x61);
+- i = 2000;
+- while (--i) udelay(1000);
+- reason &= ~8;
+- outb(reason, 0x61);
++ clear_io_check_error(reason);
+ }
+
+ static __kprobes void
+--- a/include/asm-i386/mach-default/mach_traps.h 2007-08-27 12:09:26.000000000 -0400
++++ b/include/asm-i386/mach-default/mach_traps.h 2007-08-27 14:01:24.000000000 -0400
+@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
+ outb(reason, 0x61);
+ }
+
++static inline void clear_io_check_error(unsigned char reason)
++{
++ unsigned long i;
++
++ reason = (reason & 0xf) | 8;
++ outb(reason, 0x61);
++ i = 2000;
++ while (--i) udelay(1000);
++ reason &= ~8;
++ outb(reason, 0x61);
++}
++
+ static inline unsigned char get_nmi_reason(void)
+ {
+ return inb(0x61);
diff --git a/tags/2.6.22-2/20008_net-csum.patch1 b/tags/2.6.22-2/20008_net-csum.patch1
new file mode 100644
index 0000000..f52f99b
--- /dev/null
+++ b/tags/2.6.22-2/20008_net-csum.patch1
@@ -0,0 +1,50 @@
+Subject: xen3 TCP/UDP checksumming
+From: http://xenbits.xensource.com/xen-3.1-testing.hg (tip 15042)
+Acked-by: jbeulich@novell.com
+
+This is only a guess, based on suggestions from Keir Fraser.
+
+---
+ net/ipv4/netfilter/nf_nat_proto_tcp.c | 3 +++
+ net/ipv4/netfilter/nf_nat_proto_udp.c | 4 ++++
+ net/ipv4/xfrm4_output.c | 4 ++++
+ 3 files changed, 11 insertions(+)
+
+--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c 2007-08-27 12:09:26.000000000 -0400
++++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c 2007-08-27 14:01:24.000000000 -0400
+@@ -132,6 +132,9 @@ tcp_manip_pkt(struct sk_buff **pskb,
+ if (hdrsize < sizeof(*hdr))
+ return 1;
+
++ if (skb_checksum_setup(*pskb))
++ return 0;
++
+ nf_proto_csum_replace4(&hdr->check, *pskb, oldip, newip, 1);
+ nf_proto_csum_replace2(&hdr->check, *pskb, oldport, newport, 0);
+ return 1;
+--- a/net/ipv4/netfilter/nf_nat_proto_udp.c 2007-08-27 12:09:26.000000000 -0400
++++ b/net/ipv4/netfilter/nf_nat_proto_udp.c 2007-08-27 14:01:24.000000000 -0400
+@@ -116,6 +116,10 @@ udp_manip_pkt(struct sk_buff **pskb,
+ newport = tuple->dst.u.udp.port;
+ portptr = &hdr->dest;
+ }
++
++ if (skb_checksum_setup(*pskb))
++ return 0;
++
+ if (hdr->check || (*pskb)->ip_summed == CHECKSUM_PARTIAL) {
+ nf_proto_csum_replace4(&hdr->check, *pskb, oldip, newip, 1);
+ nf_proto_csum_replace2(&hdr->check, *pskb, *portptr, newport,
+--- a/net/ipv4/xfrm4_output.c 2007-08-27 12:09:26.000000000 -0400
++++ b/net/ipv4/xfrm4_output.c 2007-08-27 14:01:24.000000000 -0400
+@@ -47,6 +47,10 @@ static int xfrm4_output_one(struct sk_bu
+ struct xfrm_state *x = dst->xfrm;
+ int err;
+
++ err = skb_checksum_setup(skb);
++ if (err)
++ goto error_nolock;
++
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ err = skb_checksum_help(skb);
+ if (err)
diff --git a/tags/2.6.22-2/20009_xenoprof-generic.patch1 b/tags/2.6.22-2/20009_xenoprof-generic.patch1
new file mode 100644
index 0000000..5b73392
--- /dev/null
+++ b/tags/2.6.22-2/20009_xenoprof-generic.patch1
@@ -0,0 +1,669 @@
+Subject: Xen oprofile
+From: http://xenbits.xensource.com/xen-3.1-testing.hg (tip 15042)
+Acked-by: jbeulich@novell.com
+
+---
+ drivers/oprofile/buffer_sync.c | 87 ++++++++++++----
+ drivers/oprofile/cpu_buffer.c | 51 +++++++--
+ drivers/oprofile/cpu_buffer.h | 9 +
+ drivers/oprofile/event_buffer.h | 7 +
+ drivers/oprofile/oprof.c | 32 +++++-
+ drivers/oprofile/oprof.h | 3
+ drivers/oprofile/oprofile_files.c | 201 +++++++++++++++++++++++++++++++++++++-
+ include/linux/oprofile.h | 9 +
+ 8 files changed, 360 insertions(+), 39 deletions(-)
+
+--- a/drivers/oprofile/buffer_sync.c 2007-08-27 12:09:26.000000000 -0400
++++ b/drivers/oprofile/buffer_sync.c 2007-08-27 14:02:05.000000000 -0400
+@@ -6,6 +6,10 @@
+ *
+ * @author John Levon <levon@movementarian.org>
+ *
++ * Modified by Aravind Menon for Xen
++ * These modifications are:
++ * Copyright (C) 2005 Hewlett-Packard Co.
++ *
+ * This is the core of the buffer management. Each
+ * CPU buffer is processed and entered into the
+ * global event buffer. Such processing is necessary
+@@ -39,6 +43,7 @@ static cpumask_t marked_cpus = CPU_MASK_
+ static DEFINE_SPINLOCK(task_mortuary);
+ static void process_task_mortuary(void);
+
++static int cpu_current_domain[NR_CPUS];
+
+ /* Take ownership of the task struct and place it on the
+ * list for processing. Only after two full buffer syncs
+@@ -147,6 +152,11 @@ static void end_sync(void)
+ int sync_start(void)
+ {
+ int err;
++ int i;
++
++ for (i = 0; i < NR_CPUS; i++) {
++ cpu_current_domain[i] = COORDINATOR_DOMAIN;
++ }
+
+ start_cpu_work();
+
+@@ -276,15 +286,31 @@ static void add_cpu_switch(int i)
+ last_cookie = INVALID_COOKIE;
+ }
+
+-static void add_kernel_ctx_switch(unsigned int in_kernel)
++static void add_cpu_mode_switch(unsigned int cpu_mode)
+ {
+ add_event_entry(ESCAPE_CODE);
+- if (in_kernel)
+- add_event_entry(KERNEL_ENTER_SWITCH_CODE);
+- else
+- add_event_entry(KERNEL_EXIT_SWITCH_CODE);
++ switch (cpu_mode) {
++ case CPU_MODE_USER:
++ add_event_entry(USER_ENTER_SWITCH_CODE);
++ break;
++ case CPU_MODE_KERNEL:
++ add_event_entry(KERNEL_ENTER_SWITCH_CODE);
++ break;
++ case CPU_MODE_XEN:
++ add_event_entry(XEN_ENTER_SWITCH_CODE);
++ break;
++ default:
++ break;
++ }
+ }
+-
++
++static void add_domain_switch(unsigned long domain_id)
++{
++ add_event_entry(ESCAPE_CODE);
++ add_event_entry(DOMAIN_SWITCH_CODE);
++ add_event_entry(domain_id);
++}
++
+ static void
+ add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
+ {
+@@ -349,9 +375,9 @@ static int add_us_sample(struct mm_struc
+ * for later lookup from userspace.
+ */
+ static int
+-add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
++add_sample(struct mm_struct * mm, struct op_sample * s, int cpu_mode)
+ {
+- if (in_kernel) {
++ if (cpu_mode >= CPU_MODE_KERNEL) {
+ add_sample_entry(s->eip, s->event);
+ return 1;
+ } else if (mm) {
+@@ -497,15 +523,21 @@ void sync_buffer(int cpu)
+ struct mm_struct *mm = NULL;
+ struct task_struct * new;
+ unsigned long cookie = 0;
+- int in_kernel = 1;
++ int cpu_mode = 1;
+ unsigned int i;
+ sync_buffer_state state = sb_buffer_start;
+ unsigned long available;
++ int domain_switch = 0;
+
+ mutex_lock(&buffer_mutex);
+
+ add_cpu_switch(cpu);
+
++ /* We need to assign the first samples in this CPU buffer to the
++ same domain that we were processing at the last sync_buffer */
++ if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) {
++ add_domain_switch(cpu_current_domain[cpu]);
++ }
+ /* Remember, only we can modify tail_pos */
+
+ available = get_slots(cpu_buf);
+@@ -513,16 +545,18 @@ void sync_buffer(int cpu)
+ for (i = 0; i < available; ++i) {
+ struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
+
+- if (is_code(s->eip)) {
+- if (s->event <= CPU_IS_KERNEL) {
+- /* kernel/userspace switch */
+- in_kernel = s->event;
++ if (is_code(s->eip) && !domain_switch) {
++ if (s->event <= CPU_MODE_XEN) {
++ /* xen/kernel/userspace switch */
++ cpu_mode = s->event;
+ if (state == sb_buffer_start)
+ state = sb_sample_start;
+- add_kernel_ctx_switch(s->event);
++ add_cpu_mode_switch(s->event);
+ } else if (s->event == CPU_TRACE_BEGIN) {
+ state = sb_bt_start;
+ add_trace_begin();
++ } else if (s->event == CPU_DOMAIN_SWITCH) {
++ domain_switch = 1;
+ } else {
+ struct mm_struct * oldmm = mm;
+
+@@ -536,11 +570,21 @@ void sync_buffer(int cpu)
+ add_user_ctx_switch(new, cookie);
+ }
+ } else {
+- if (state >= sb_bt_start &&
+- !add_sample(mm, s, in_kernel)) {
+- if (state == sb_bt_start) {
+- state = sb_bt_ignore;
+- atomic_inc(&oprofile_stats.bt_lost_no_mapping);
++ if (domain_switch) {
++ cpu_current_domain[cpu] = s->eip;
++ add_domain_switch(s->eip);
++ domain_switch = 0;
++ } else {
++ if (cpu_current_domain[cpu] !=
++ COORDINATOR_DOMAIN) {
++ add_sample_entry(s->eip, s->event);
++ }
++ else if (state >= sb_bt_start &&
++ !add_sample(mm, s, cpu_mode)) {
++ if (state == sb_bt_start) {
++ state = sb_bt_ignore;
++ atomic_inc(&oprofile_stats.bt_lost_no_mapping);
++ }
+ }
+ }
+ }
+@@ -549,6 +593,11 @@ void sync_buffer(int cpu)
+ }
+ release_mm(mm);
+
++ /* We reset domain to COORDINATOR at each CPU switch */
++ if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) {
++ add_domain_switch(COORDINATOR_DOMAIN);
++ }
++
+ mark_done(cpu);
+
+ mutex_unlock(&buffer_mutex);
+--- a/drivers/oprofile/cpu_buffer.c 2007-08-27 12:09:26.000000000 -0400
++++ b/drivers/oprofile/cpu_buffer.c 2007-08-27 14:02:05.000000000 -0400
+@@ -6,6 +6,10 @@
+ *
+ * @author John Levon <levon@movementarian.org>
+ *
++ * Modified by Aravind Menon for Xen
++ * These modifications are:
++ * Copyright (C) 2005 Hewlett-Packard Co.
++ *
+ * Each CPU has a local buffer that stores PC value/event
+ * pairs. We also log context switches when we notice them.
+ * Eventually each CPU's buffer is processed into the global
+@@ -34,6 +38,8 @@ static void wq_sync_buffer(struct work_s
+ #define DEFAULT_TIMER_EXPIRE (HZ / 10)
+ static int work_enabled;
+
++static int32_t current_domain = COORDINATOR_DOMAIN;
++
+ void free_cpu_buffers(void)
+ {
+ int i;
+@@ -57,7 +63,7 @@ int alloc_cpu_buffers(void)
+ goto fail;
+
+ b->last_task = NULL;
+- b->last_is_kernel = -1;
++ b->last_cpu_mode = -1;
+ b->tracing = 0;
+ b->buffer_size = buffer_size;
+ b->tail_pos = 0;
+@@ -113,7 +119,7 @@ void cpu_buffer_reset(struct oprofile_cp
+ * collected will populate the buffer with proper
+ * values to initialize the buffer
+ */
+- cpu_buf->last_is_kernel = -1;
++ cpu_buf->last_cpu_mode = -1;
+ cpu_buf->last_task = NULL;
+ }
+
+@@ -163,13 +169,13 @@ add_code(struct oprofile_cpu_buffer * bu
+ * because of the head/tail separation of the writer and reader
+ * of the CPU buffer.
+ *
+- * is_kernel is needed because on some architectures you cannot
++ * cpu_mode is needed because on some architectures you cannot
+ * tell if you are in kernel or user space simply by looking at
+- * pc. We tag this in the buffer by generating kernel enter/exit
+- * events whenever is_kernel changes
++ * pc. We tag this in the buffer by generating kernel/user (and xen)
++ * enter events whenever cpu_mode changes
+ */
+ static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc,
+- int is_kernel, unsigned long event)
++ int cpu_mode, unsigned long event)
+ {
+ struct task_struct * task;
+
+@@ -180,18 +186,18 @@ static int log_sample(struct oprofile_cp
+ return 0;
+ }
+
+- is_kernel = !!is_kernel;
+-
+ task = current;
+
+ /* notice a switch from user->kernel or vice versa */
+- if (cpu_buf->last_is_kernel != is_kernel) {
+- cpu_buf->last_is_kernel = is_kernel;
+- add_code(cpu_buf, is_kernel);
++ if (cpu_buf->last_cpu_mode != cpu_mode) {
++ cpu_buf->last_cpu_mode = cpu_mode;
++ add_code(cpu_buf, cpu_mode);
+ }
+-
++
+ /* notice a task switch */
+- if (cpu_buf->last_task != task) {
++ /* but only when not processing another domain's samples */
++ if ((cpu_buf->last_task != task) &&
++ (current_domain == COORDINATOR_DOMAIN)) {
+ cpu_buf->last_task = task;
+ add_code(cpu_buf, (unsigned long)task);
+ }
+@@ -275,6 +281,25 @@ void oprofile_add_trace(unsigned long pc
+ add_sample(cpu_buf, pc, 0);
+ }
+
++int oprofile_add_domain_switch(int32_t domain_id)
++{
++ struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
++
++ /* should have space for switching into and out of domain
++ (2 slots each) plus one sample and one cpu mode switch */
++ if (((nr_available_slots(cpu_buf) < 6) &&
++ (domain_id != COORDINATOR_DOMAIN)) ||
++ (nr_available_slots(cpu_buf) < 2))
++ return 0;
++
++ add_code(cpu_buf, CPU_DOMAIN_SWITCH);
++ add_sample(cpu_buf, domain_id, 0);
++
++ current_domain = domain_id;
++
++ return 1;
++}
++
+ /*
+ * This serves to avoid cpu buffer overflow, and makes sure
+ * the task mortuary progresses
+--- a/drivers/oprofile/cpu_buffer.h 2007-08-27 12:09:26.000000000 -0400
++++ b/drivers/oprofile/cpu_buffer.h 2007-08-27 14:01:24.000000000 -0400
+@@ -36,7 +36,7 @@ struct oprofile_cpu_buffer {
+ volatile unsigned long tail_pos;
+ unsigned long buffer_size;
+ struct task_struct * last_task;
+- int last_is_kernel;
++ int last_cpu_mode;
+ int tracing;
+ struct op_sample * buffer;
+ unsigned long sample_received;
+@@ -51,7 +51,10 @@ extern struct oprofile_cpu_buffer cpu_bu
+ void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf);
+
+ /* transient events for the CPU buffer -> event buffer */
+-#define CPU_IS_KERNEL 1
+-#define CPU_TRACE_BEGIN 2
++#define CPU_MODE_USER 0
++#define CPU_MODE_KERNEL 1
++#define CPU_MODE_XEN 2
++#define CPU_TRACE_BEGIN 3
++#define CPU_DOMAIN_SWITCH 4
+
+ #endif /* OPROFILE_CPU_BUFFER_H */
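++
++/*
++ * Illustrative sketch (not part of the patch): how a consumer of the
++ * CPU buffer tells these transient codes apart.  ESCAPE_CODE (~0UL) in
++ * the eip field is what buffer_sync.c's is_code() tests for; the event
++ * word stored next to it then carries one of the values above:
++ *
++ *	if (is_code(s->eip)) {
++ *		if (s->event <= CPU_MODE_XEN)
++ *			...	// user(0)/kernel(1)/xen(2) mode switch
++ *		else if (s->event == CPU_TRACE_BEGIN)
++ *			...	// a backtrace follows
++ *		else if (s->event == CPU_DOMAIN_SWITCH)
++ *			...	// next slot's eip is the new domain id
++ *		else
++ *			...	// a task pointer: context switch
++ *	}
++ */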
+--- a/drivers/oprofile/event_buffer.h 2007-08-27 12:09:26.000000000 -0400
++++ b/drivers/oprofile/event_buffer.h 2007-08-27 14:01:24.000000000 -0400
+@@ -29,15 +29,20 @@ void wake_up_buffer_waiter(void);
+ #define CPU_SWITCH_CODE 2
+ #define COOKIE_SWITCH_CODE 3
+ #define KERNEL_ENTER_SWITCH_CODE 4
+-#define KERNEL_EXIT_SWITCH_CODE 5
++#define USER_ENTER_SWITCH_CODE 5
+ #define MODULE_LOADED_CODE 6
+ #define CTX_TGID_CODE 7
+ #define TRACE_BEGIN_CODE 8
+ #define TRACE_END_CODE 9
++#define XEN_ENTER_SWITCH_CODE 10
++#define DOMAIN_SWITCH_CODE 11
+
+ #define INVALID_COOKIE ~0UL
+ #define NO_COOKIE 0UL
+
++/* Constant used to refer to coordinator domain (Xen) */
++#define COORDINATOR_DOMAIN -1
++
+ /* add data to the event buffer */
+ void add_event_entry(unsigned long data);
+
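++
++/*
++ * Illustrative sketch (not part of the patch): the record that
++ * buffer_sync.c's add_domain_switch() emits into the event buffer, as
++ * the userspace daemon reads it -- three consecutive unsigned longs:
++ *
++ *	add_event_entry(ESCAPE_CODE);
++ *	add_event_entry(DOMAIN_SWITCH_CODE);
++ *	add_event_entry(domain_id);
++ *
++ * Every sample that follows is attributed to domain_id until the next
++ * switch record; a domain_id of COORDINATOR_DOMAIN (-1) marks a return
++ * to the coordinator (Xen) context.
++ */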
+--- a/drivers/oprofile/oprof.c 2007-08-27 12:09:26.000000000 -0400
++++ b/drivers/oprofile/oprof.c 2007-08-27 14:02:05.000000000 -0400
+@@ -5,6 +5,10 @@
+ * @remark Read the file COPYING
+ *
+ * @author John Levon <levon@movementarian.org>
++ *
++ * Modified by Aravind Menon for Xen
++ * These modifications are:
++ * Copyright (C) 2005 Hewlett-Packard Co.
+ */
+
+ #include <linux/kernel.h>
+@@ -19,7 +23,7 @@
+ #include "cpu_buffer.h"
+ #include "buffer_sync.h"
+ #include "oprofile_stats.h"
+-
++
+ struct oprofile_operations oprofile_ops;
+
+ unsigned long oprofile_started;
+@@ -33,6 +37,32 @@ static DEFINE_MUTEX(start_mutex);
+ */
+ static int timer = 0;
+
++int oprofile_set_active(int active_domains[], unsigned int adomains)
++{
++ int err;
++
++ if (!oprofile_ops.set_active)
++ return -EINVAL;
++
++ mutex_lock(&start_mutex);
++ err = oprofile_ops.set_active(active_domains, adomains);
++ mutex_unlock(&start_mutex);
++ return err;
++}
++
++int oprofile_set_passive(int passive_domains[], unsigned int pdomains)
++{
++ int err;
++
++ if (!oprofile_ops.set_passive)
++ return -EINVAL;
++
++ mutex_lock(&start_mutex);
++ err = oprofile_ops.set_passive(passive_domains, pdomains);
++ mutex_unlock(&start_mutex);
++ return err;
++}
++
+ int oprofile_setup(void)
+ {
+ int err;
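++
++/*
++ * Illustrative sketch (not part of the patch): a xenoprof backend is
++ * expected to publish the two new hooks through its oprofile_operations
++ * (xenoprof_set_active/xenoprof_set_passive are assumed names, not
++ * defined anywhere in this patch):
++ *
++ *	static int xenoprof_init(struct oprofile_operations *ops)
++ *	{
++ *		ops->set_active  = xenoprof_set_active;
++ *		ops->set_passive = xenoprof_set_passive;
++ *		return 0;
++ *	}
++ *
++ * oprofile_set_active()/oprofile_set_passive() above then serialize
++ * calls into those hooks under start_mutex and return -EINVAL when the
++ * loaded backend provides no such hook.
++ */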
+--- a/drivers/oprofile/oprof.h 2007-08-27 12:09:26.000000000 -0400
++++ b/drivers/oprofile/oprof.h 2007-08-27 14:01:24.000000000 -0400
+@@ -35,5 +35,8 @@ void oprofile_create_files(struct super_
+ void oprofile_timer_init(struct oprofile_operations * ops);
+
+ int oprofile_set_backtrace(unsigned long depth);
++
++int oprofile_set_active(int active_domains[], unsigned int adomains);
++int oprofile_set_passive(int passive_domains[], unsigned int pdomains);
+
+ #endif /* OPROF_H */
+--- a/drivers/oprofile/oprofile_files.c 2007-08-27 12:09:26.000000000 -0400
++++ b/drivers/oprofile/oprofile_files.c 2007-08-27 14:02:05.000000000 -0400
+@@ -5,15 +5,21 @@
+ * @remark Read the file COPYING
+ *
+ * @author John Levon <levon@movementarian.org>
++ *
++ * Modified by Aravind Menon for Xen
++ * These modifications are:
++ * Copyright (C) 2005 Hewlett-Packard Co.
+ */
+
+ #include <linux/fs.h>
+ #include <linux/oprofile.h>
++#include <asm/uaccess.h>
++#include <linux/ctype.h>
+
+ #include "event_buffer.h"
+ #include "oprofile_stats.h"
+ #include "oprof.h"
+-
++
+ unsigned long fs_buffer_size = 131072;
+ unsigned long fs_cpu_buffer_size = 8192;
+ unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */
+@@ -117,11 +123,202 @@ static ssize_t dump_write(struct file *
+ static const struct file_operations dump_fops = {
+ .write = dump_write,
+ };
+-
++
++#define TMPBUFSIZE 512
++
++static unsigned int adomains = 0;
++static int active_domains[MAX_OPROF_DOMAINS + 1];
++static DEFINE_MUTEX(adom_mutex);
++
++static ssize_t adomain_write(struct file * file, char const __user * buf,
++ size_t count, loff_t * offset)
++{
++ char *tmpbuf;
++ char *startp, *endp;
++ int i;
++ unsigned long val;
++ ssize_t retval = count;
++
++ if (*offset)
++ return -EINVAL;
++ if (count > TMPBUFSIZE - 1)
++ return -EINVAL;
++
++ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
++ return -ENOMEM;
++
++ if (copy_from_user(tmpbuf, buf, count)) {
++ kfree(tmpbuf);
++ return -EFAULT;
++ }
++ tmpbuf[count] = 0;
++
++ mutex_lock(&adom_mutex);
++
++ startp = tmpbuf;
++ /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
++ for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
++ val = simple_strtoul(startp, &endp, 0);
++ if (endp == startp)
++ break;
++ while (ispunct(*endp) || isspace(*endp))
++ endp++;
++ active_domains[i] = val;
++ if (active_domains[i] != val)
++ /* Overflow, force error below */
++ i = MAX_OPROF_DOMAINS + 1;
++ startp = endp;
++ }
++ /* Force error on trailing junk */
++ adomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
++
++ kfree(tmpbuf);
++
++ if (adomains > MAX_OPROF_DOMAINS
++ || oprofile_set_active(active_domains, adomains)) {
++ adomains = 0;
++ retval = -EINVAL;
++ }
++
++ mutex_unlock(&adom_mutex);
++ return retval;
++}
++
++static ssize_t adomain_read(struct file * file, char __user * buf,
++ size_t count, loff_t * offset)
++{
++ char * tmpbuf;
++ size_t len;
++ int i;
++ ssize_t retval;
++
++ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
++ return -ENOMEM;
++
++ mutex_lock(&adom_mutex);
++
++ len = 0;
++ for (i = 0; i < adomains; i++)
++ len += snprintf(tmpbuf + len,
++ len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
++ "%u ", active_domains[i]);
++ WARN_ON(len > TMPBUFSIZE);
++ if (len != 0 && len <= TMPBUFSIZE)
++ tmpbuf[len-1] = '\n';
++
++ mutex_unlock(&adom_mutex);
++
++ retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
++
++ kfree(tmpbuf);
++ return retval;
++}
++
++
++static struct file_operations active_domain_ops = {
++ .read = adomain_read,
++ .write = adomain_write,
++};
++
++static unsigned int pdomains = 0;
++static int passive_domains[MAX_OPROF_DOMAINS];
++static DEFINE_MUTEX(pdom_mutex);
++
++static ssize_t pdomain_write(struct file * file, char const __user * buf,
++ size_t count, loff_t * offset)
++{
++ char *tmpbuf;
++ char *startp, *endp;
++ int i;
++ unsigned long val;
++ ssize_t retval = count;
++
++ if (*offset)
++ return -EINVAL;
++ if (count > TMPBUFSIZE - 1)
++ return -EINVAL;
++
++ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
++ return -ENOMEM;
++
++ if (copy_from_user(tmpbuf, buf, count)) {
++ kfree(tmpbuf);
++ return -EFAULT;
++ }
++ tmpbuf[count] = 0;
++
++ mutex_lock(&pdom_mutex);
++
++ startp = tmpbuf;
++ /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
++ for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
++ val = simple_strtoul(startp, &endp, 0);
++ if (endp == startp)
++ break;
++ while (ispunct(*endp) || isspace(*endp))
++ endp++;
++ passive_domains[i] = val;
++ if (passive_domains[i] != val)
++ /* Overflow, force error below */
++ i = MAX_OPROF_DOMAINS + 1;
++ startp = endp;
++ }
++ /* Force error on trailing junk */
++ pdomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
++
++ kfree(tmpbuf);
++
++ if (pdomains > MAX_OPROF_DOMAINS
++ || oprofile_set_passive(passive_domains, pdomains)) {
++ pdomains = 0;
++ retval = -EINVAL;
++ }
++
++ mutex_unlock(&pdom_mutex);
++ return retval;
++}
++
++static ssize_t pdomain_read(struct file * file, char __user * buf,
++ size_t count, loff_t * offset)
++{
++ char * tmpbuf;
++ size_t len;
++ int i;
++ ssize_t retval;
++
++ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
++ return -ENOMEM;
++
++ mutex_lock(&pdom_mutex);
++
++ len = 0;
++ for (i = 0; i < pdomains; i++)
++ len += snprintf(tmpbuf + len,
++ len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
++ "%u ", passive_domains[i]);
++ WARN_ON(len > TMPBUFSIZE);
++ if (len != 0 && len <= TMPBUFSIZE)
++ tmpbuf[len-1] = '\n';
++
++ mutex_unlock(&pdom_mutex);
++
++ retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
++
++ kfree(tmpbuf);
++ return retval;
++}
++
++static struct file_operations passive_domain_ops = {
++ .read = pdomain_read,
++ .write = pdomain_write,
++};
++
+ void oprofile_create_files(struct super_block * sb, struct dentry * root)
+ {
+ oprofilefs_create_file(sb, root, "enable", &enable_fops);
+ oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666);
++ oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops);
++ oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops);
+ oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops);
+ oprofilefs_create_ulong(sb, root, "buffer_size", &fs_buffer_size);
+ oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed);
+--- a/include/linux/oprofile.h 2007-08-27 12:09:26.000000000 -0400
++++ b/include/linux/oprofile.h 2007-08-27 14:02:05.000000000 -0400
+@@ -16,6 +16,8 @@
+ #include <linux/types.h>
+ #include <linux/spinlock.h>
+ #include <asm/atomic.h>
++
++#include <xen/interface/xenoprof.h>
+
+ struct super_block;
+ struct dentry;
+@@ -27,6 +29,11 @@ struct oprofile_operations {
+ /* create any necessary configuration files in the oprofile fs.
+ * Optional. */
+ int (*create_files)(struct super_block * sb, struct dentry * root);
++ /* setup active domains with Xen */
++ int (*set_active)(int *active_domains, unsigned int adomains);
++ /* setup passive domains with Xen */
++ int (*set_passive)(int *passive_domains, unsigned int pdomains);
++
+ /* Do any necessary interrupt setup. Optional. */
+ int (*setup)(void);
+ /* Do any necessary interrupt shutdown. Optional. */
+@@ -78,6 +85,8 @@ void oprofile_add_pc(unsigned long pc, i
+ /* add a backtrace entry, to be called from the ->backtrace callback */
+ void oprofile_add_trace(unsigned long eip);
+
++/* add a domain switch entry */
++int oprofile_add_domain_switch(int32_t domain_id);
+
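++/*
++ * Illustrative sketch (not part of the patch): the arch xenoprof code
++ * is the intended caller, flagging whose samples follow before logging
++ * samples taken in a passive domain (current_domain_changed() is an
++ * assumed helper name used only for this example):
++ *
++ *	if (current_domain_changed(id) &&
++ *	    !oprofile_add_domain_switch(id))
++ *		return;		// per-CPU buffer too full; drop it
++ *	oprofile_add_pc(pc, mode, event);
++ *
++ * A return value of 0 means the per-CPU buffer lacked room, in which
++ * case the switch (and the samples depending on it) must be dropped.
++ */
++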
+ /**
+ * Create a file of the given name as a child of the given root, with
diff --git a/tags/2.6.22-2/20010_softlockup-no-idle-hz.patch1 b/tags/2.6.22-2/20010_softlockup-no-idle-hz.patch1
new file mode 100644
index 0000000..e67c89c
--- /dev/null
+++ b/tags/2.6.22-2/20010_softlockup-no-idle-hz.patch1
@@ -0,0 +1,75 @@
+Subject: xen3 softlockup - no-idle-hz interaction fix
+From: http://xenbits.xensource.com/xen-3.1-testing.hg (tip 15042)
+Acked-by: jbeulich@novell.com
+
+Index: head-2007-09-25/include/linux/sched.h
+===================================================================
+--- head-2007-09-25.orig/include/linux/sched.h 2007-09-25 14:22:37.000000000 +0200
++++ head-2007-09-25/include/linux/sched.h 2007-09-25 14:32:18.000000000 +0200
+@@ -236,11 +236,16 @@ extern void update_process_times(int use
+ extern void scheduler_tick(void);
+
+ #ifdef CONFIG_DETECT_SOFTLOCKUP
++extern unsigned long softlockup_get_next_event(void);
+ extern void softlockup_tick(void);
+ extern void spawn_softlockup_task(void);
+ extern void touch_softlockup_watchdog(void);
+ extern void touch_all_softlockup_watchdogs(void);
+ #else
++static inline unsigned long softlockup_get_next_event(void)
++{
++ return MAX_JIFFY_OFFSET;
++}
+ static inline void softlockup_tick(void)
+ {
+ }
+Index: head-2007-09-25/kernel/softlockup.c
+===================================================================
+--- head-2007-09-25.orig/kernel/softlockup.c 2007-09-25 14:22:37.000000000 +0200
++++ head-2007-09-25/kernel/softlockup.c 2007-09-25 14:32:18.000000000 +0200
+@@ -60,6 +60,19 @@ void touch_all_softlockup_watchdogs(void
+ }
+ EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
+
++unsigned long softlockup_get_next_event(void)
++{
++ int this_cpu = smp_processor_id();
++ unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
++
++ if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
++ did_panic ||
++ !per_cpu(watchdog_task, this_cpu))
++ return MAX_JIFFY_OFFSET;
++
++ return max_t(long, 0, touch_timestamp + HZ - jiffies);
++}
++
+ /*
+ * This callback runs from the timer interrupt, and checks
+ * whether the watchdog thread has hung or not:
+Index: head-2007-09-25/kernel/timer.c
+===================================================================
+--- head-2007-09-25.orig/kernel/timer.c 2007-09-25 14:22:37.000000000 +0200
++++ head-2007-09-25/kernel/timer.c 2007-09-25 14:32:18.000000000 +0200
+@@ -781,7 +781,7 @@ static unsigned long cmp_next_hrtimer_ev
+ unsigned long get_next_timer_interrupt(unsigned long now)
+ {
+ tvec_base_t *base = __get_cpu_var(tvec_bases);
+- unsigned long expires;
++ unsigned long expires, sl_next;
+
+ spin_lock(&base->lock);
+ expires = __next_timer_interrupt(base);
+@@ -790,7 +790,11 @@ unsigned long get_next_timer_interrupt(u
+ if (time_before_eq(expires, now))
+ return now;
+
+- return cmp_next_hrtimer_event(now, expires);
++ expires = cmp_next_hrtimer_event(now, expires);
++ sl_next = softlockup_get_next_event();
++
++ return expires <= now || expires - now < sl_next
++ ? expires : now + sl_next;
+ }
+
+ #ifdef CONFIG_NO_IDLE_HZ
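+
+Worked example (illustrative, not part of the patch): with HZ=250, suppose
+the next timer-wheel event is 100 jiffies away but this CPU's softlockup
+watchdog was last touched 200 jiffies ago. softlockup_get_next_event()
+returns max(0, touch_timestamp + HZ - jiffies) = 250 - 200 = 50, so
+get_next_timer_interrupt() returns now + 50 rather than now + 100: the
+otherwise-idle CPU (CONFIG_NO_IDLE_HZ) wakes early enough for the
+watchdog bookkeeping in softlockup_tick() to run, instead of sleeping
+through it and risking a false soft-lockup report.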
diff --git a/tags/2.6.22-2/20011_xen3-auto-xen-arch.patch1 b/tags/2.6.22-2/20011_xen3-auto-xen-arch.patch1
new file mode 100644
index 0000000..0252943
--- /dev/null
+++ b/tags/2.6.22-2/20011_xen3-auto-xen-arch.patch1
@@ -0,0 +1,49825 @@
+Subject: xen3 xen-arch
+From: http://xenbits.xensource.com/xen-3.1-testing.hg (tip 15042)
+Patch-mainline: obsolete
+Acked-by: jbeulich@novell.com
+
+List of files that have Xen derivatives (perhaps created during the merging
+of newer kernel versions), for xen-port-patches.py to pick up (i.e. this
+list must be retained here until the XenSource tree has picked up the
+respective kernel.org version):
+
++++ linux/arch/i386/kernel/e820-xen.c
++++ linux/drivers/char/mem-xen.c
++++ linux/lib/swiotlb-xen.c
+
+---
+ arch/i386/boot-xen/Makefile | 21
+ arch/i386/kernel/acpi/boot-xen.c | 1168 ++++++++
+ arch/i386/kernel/apic-xen.c | 155 +
+ arch/i386/kernel/cpu/common-xen.c | 743 +++++
+ arch/i386/kernel/cpu/mtrr/main-xen.c | 197 +
+ arch/i386/kernel/early_printk-xen.c | 2
+ arch/i386/kernel/entry-xen.S | 1216 ++++++++
+ arch/i386/kernel/fixup.c | 88
+ arch/i386/kernel/head-xen.S | 207 +
+ arch/i386/kernel/init_task-xen.c | 51
+ arch/i386/kernel/io_apic-xen.c | 2777 ++++++++++++++++++++
+ arch/i386/kernel/ioport-xen.c | 122
+ arch/i386/kernel/irq-xen.c | 324 ++
+ arch/i386/kernel/ldt-xen.c | 270 +
+ arch/i386/kernel/microcode-xen.c | 144 +
+ arch/i386/kernel/mpparse-xen.c | 1185 ++++++++
+ arch/i386/kernel/pci-dma-xen.c | 366 ++
+ arch/i386/kernel/process-xen.c | 853 ++++++
+ arch/i386/kernel/quirks-xen.c | 47
+ arch/i386/kernel/setup-xen.c | 1871 +++++++++++++
+ arch/i386/kernel/smp-xen.c | 624 ++++
+ arch/i386/kernel/swiotlb.c | 716 +++++
+ arch/i386/kernel/time-xen.c | 1141 ++++++++
+ arch/i386/kernel/traps-xen.c | 1186 ++++++++
+ arch/i386/kernel/vsyscall-note-xen.S | 32
+ arch/i386/mach-xen/Makefile | 5
+ arch/i386/mach-xen/setup.c | 147 +
+ arch/i386/mm/fault-xen.c | 769 +++++
+ arch/i386/mm/highmem-xen.c | 136
+ arch/i386/mm/hypervisor.c | 451 +++
+ arch/i386/mm/init-xen.c | 850 ++++++
+ arch/i386/mm/ioremap-xen.c | 443 +++
+ arch/i386/mm/pgtable-xen.c | 727 +++++
+ arch/i386/oprofile/xenoprof.c | 179 +
+ arch/i386/pci/irq-xen.c | 1205 ++++++++
+ arch/i386/pci/pcifront.c | 55
+ arch/x86_64/ia32/ia32entry-xen.S | 743 +++++
+ arch/x86_64/ia32/syscall32-xen.c | 128
+ arch/x86_64/ia32/syscall32_syscall-xen.S | 28
+ arch/x86_64/ia32/vsyscall-int80.S | 58
+ arch/x86_64/kernel/apic-xen.c | 197 +
+ arch/x86_64/kernel/e820-xen.c | 774 +++++
+ arch/x86_64/kernel/early_printk-xen.c | 302 ++
+ arch/x86_64/kernel/entry-xen.S | 1325 +++++++++
+ arch/x86_64/kernel/genapic-xen.c | 143 +
+ arch/x86_64/kernel/genapic_xen.c | 161 +
+ arch/x86_64/kernel/head-xen.S | 203 +
+ arch/x86_64/kernel/head64-xen.c | 162 +
+ arch/x86_64/kernel/io_apic-xen.c | 2269 ++++++++++++++++
+ arch/x86_64/kernel/ioport-xen.c | 99
+ arch/x86_64/kernel/irq-xen.c | 197 +
+ arch/x86_64/kernel/ldt-xen.c | 282 ++
+ arch/x86_64/kernel/mpparse-xen.c | 1011 +++++++
+ arch/x86_64/kernel/pci-swiotlb-xen.c | 55
+ arch/x86_64/kernel/process-xen.c | 829 +++++
+ arch/x86_64/kernel/setup-xen.c | 1650 +++++++++++
+ arch/x86_64/kernel/setup64-xen.c | 361 ++
+ arch/x86_64/kernel/smp-xen.c | 600 ++++
+ arch/x86_64/kernel/traps-xen.c | 1175 ++++++++
+ arch/x86_64/kernel/vsyscall-xen.c | 239 +
+ arch/x86_64/kernel/xen_entry.S | 40
+ arch/x86_64/mm/fault-xen.c | 724 +++++
+ arch/x86_64/mm/init-xen.c | 1241 ++++++++
+ arch/x86_64/mm/pageattr-xen.c | 433 +++
+ include/asm-i386/mach-xen/asm/agp.h | 37
+ include/asm-i386/mach-xen/asm/desc.h | 164 +
+ include/asm-i386/mach-xen/asm/dma-mapping.h | 157 +
+ include/asm-i386/mach-xen/asm/fixmap.h | 155 +
+ include/asm-i386/mach-xen/asm/floppy.h | 147 +
+ include/asm-i386/mach-xen/asm/highmem.h | 80
+ include/asm-i386/mach-xen/asm/hw_irq.h | 72
+ include/asm-i386/mach-xen/asm/hypercall.h | 407 ++
+ include/asm-i386/mach-xen/asm/hypervisor.h | 258 +
+ include/asm-i386/mach-xen/asm/io.h | 390 ++
+ include/asm-i386/mach-xen/asm/irqflags.h | 127
+ include/asm-i386/mach-xen/asm/maddr.h | 193 +
+ include/asm-i386/mach-xen/asm/mmu.h | 29
+ include/asm-i386/mach-xen/asm/mmu_context.h | 108
+ include/asm-i386/mach-xen/asm/page.h | 229 +
+ include/asm-i386/mach-xen/asm/param.h | 23
+ include/asm-i386/mach-xen/asm/pci.h | 146 +
+ include/asm-i386/mach-xen/asm/pgalloc.h | 59
+ include/asm-i386/mach-xen/asm/pgtable-2level-defs.h | 20
+ include/asm-i386/mach-xen/asm/pgtable-2level.h | 118
+ include/asm-i386/mach-xen/asm/pgtable-3level-defs.h | 24
+ include/asm-i386/mach-xen/asm/pgtable-3level.h | 206 +
+ include/asm-i386/mach-xen/asm/pgtable.h | 530 +++
+ include/asm-i386/mach-xen/asm/processor.h | 741 +++++
+ include/asm-i386/mach-xen/asm/ptrace.h | 90
+ include/asm-i386/mach-xen/asm/scatterlist.h | 22
+ include/asm-i386/mach-xen/asm/segment.h | 117
+ include/asm-i386/mach-xen/asm/setup.h | 81
+ include/asm-i386/mach-xen/asm/smp.h | 103
+ include/asm-i386/mach-xen/asm/spinlock.h | 202 +
+ include/asm-i386/mach-xen/asm/swiotlb.h | 43
+ include/asm-i386/mach-xen/asm/synch_bitops.h | 145 +
+ include/asm-i386/mach-xen/asm/system.h | 488 +++
+ include/asm-i386/mach-xen/asm/tlbflush.h | 101
+ include/asm-i386/mach-xen/asm/vga.h | 20
+ include/asm-i386/mach-xen/asm/xenoprof.h | 48
+ include/asm-i386/mach-xen/irq_vectors.h | 125
+ include/asm-i386/mach-xen/mach_traps.h | 33
+ include/asm-i386/mach-xen/setup_arch.h | 5
+ include/asm-x86_64/mach-xen/asm/agp.h | 35
+ include/asm-x86_64/mach-xen/asm/arch_hooks.h | 27
+ include/asm-x86_64/mach-xen/asm/bootsetup.h | 42
+ include/asm-x86_64/mach-xen/asm/desc.h | 263 +
+ include/asm-x86_64/mach-xen/asm/dma-mapping.h | 207 +
+ include/asm-x86_64/mach-xen/asm/e820.h | 66
+ include/asm-x86_64/mach-xen/asm/fixmap.h | 112
+ include/asm-x86_64/mach-xen/asm/floppy.h | 206 +
+ include/asm-x86_64/mach-xen/asm/hw_irq.h | 136
+ include/asm-x86_64/mach-xen/asm/hypercall.h | 406 ++
+ include/asm-x86_64/mach-xen/asm/hypervisor.h | 2
+ include/asm-x86_64/mach-xen/asm/io.h | 330 ++
+ include/asm-x86_64/mach-xen/asm/irq.h | 38
+ include/asm-x86_64/mach-xen/asm/irqflags.h | 139 +
+ include/asm-x86_64/mach-xen/asm/maddr.h | 161 +
+ include/asm-x86_64/mach-xen/asm/mmu.h | 38
+ include/asm-x86_64/mach-xen/asm/mmu_context.h | 136
+ include/asm-x86_64/mach-xen/asm/msr.h | 399 ++
+ include/asm-x86_64/mach-xen/asm/nmi.h | 93
+ include/asm-x86_64/mach-xen/asm/page.h | 214 +
+ include/asm-x86_64/mach-xen/asm/pci.h | 166 +
+ include/asm-x86_64/mach-xen/asm/pgalloc.h | 204 +
+ include/asm-x86_64/mach-xen/asm/pgtable.h | 573 ++++
+ include/asm-x86_64/mach-xen/asm/processor.h | 506 +++
+ include/asm-x86_64/mach-xen/asm/ptrace.h | 127
+ include/asm-x86_64/mach-xen/asm/smp.h | 150 +
+ include/asm-x86_64/mach-xen/asm/synch_bitops.h | 2
+ include/asm-x86_64/mach-xen/asm/system.h | 262 +
+ include/asm-x86_64/mach-xen/asm/timer.h | 67
+ include/asm-x86_64/mach-xen/asm/tlbflush.h | 103
+ include/asm-x86_64/mach-xen/asm/vga.h | 20
+ include/asm-x86_64/mach-xen/asm/xenoprof.h | 1
+ include/asm-x86_64/mach-xen/asm/xor.h | 328 ++
+ include/asm-x86_64/mach-xen/irq_vectors.h | 123
+ include/asm-x86_64/mach-xen/mach_time.h | 111
+ include/asm-x86_64/mach-xen/mach_timer.h | 50
+ include/asm-x86_64/mach-xen/setup_arch_post.h | 63
+ include/asm-x86_64/mach-xen/setup_arch_pre.h | 5
+ include/xen/balloon.h | 57
+ include/xen/blkif.h | 97
+ include/xen/cpu_hotplug.h | 44
+ include/xen/driver_util.h | 14
+ include/xen/evtchn.h | 126
+ include/xen/features.h | 20
+ include/xen/gnttab.h | 138
+ include/xen/hvm.h | 23
+ include/xen/hypercall.h | 24
+ include/xen/hypervisor_sysfs.h | 32
+ include/xen/pcifront.h | 76
+ include/xen/public/evtchn.h | 88
+ include/xen/public/gntdev.h | 105
+ include/xen/public/privcmd.h | 79
+ include/xen/xen_proc.h | 12
+ include/xen/xenbus.h | 302 ++
+ include/xen/xencons.h | 19
+ include/xen/xenoprof.h | 42
+ scripts/Makefile.xen | 14
+ 160 files changed, 49168 insertions(+)
+
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/boot-xen/Makefile 2007-08-27 14:01:24.000000000 -0400
+@@ -0,0 +1,21 @@
++
++OBJCOPYFLAGS := -g --strip-unneeded
++
++vmlinuz: vmlinux-stripped FORCE
++ $(call if_changed,gzip)
++
++vmlinux-stripped: vmlinux FORCE
++ $(call if_changed,objcopy)
++
++INSTALL_ROOT := $(patsubst %/boot,%,$(INSTALL_PATH))
++
++XINSTALL_NAME ?= $(KERNELRELEASE)
++install:
++ mkdir -p $(INSTALL_ROOT)/boot
++ ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
++ rm -f $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
++ install -m0644 vmlinuz $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
++ install -m0644 vmlinux $(INSTALL_ROOT)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
++ install -m0664 .config $(INSTALL_ROOT)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
++ install -m0664 System.map $(INSTALL_ROOT)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
++ ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
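++
++# Illustrative sketch (not part of the patch): with, say,
++# XINSTALL_NAME=2.6.22.18-xen and XENGUEST/INSTALL_SUFFIX left empty,
++# the install target above places vmlinuz-2.6.22.18-xen,
++# vmlinux-syms-2.6.22.18-xen, config-2.6.22.18-xen and
++# System.map-2.6.22.18-xen under $(INSTALL_ROOT)/boot, plus
++# vmlinuz-2.6.22 and vmlinuz-2.6 symlinks pointing at the gzipped image.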
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/acpi/boot-xen.c 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,1168 @@
++/*
++ * boot.c - Architecture-Specific Low-Level ACPI Boot Support
++ *
++ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
++ * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ */
++
++#include <linux/init.h>
++#include <linux/acpi.h>
++#include <linux/efi.h>
++#include <linux/module.h>
++#include <linux/dmi.h>
++#include <linux/irq.h>
++
++#include <asm/pgtable.h>
++#include <asm/io_apic.h>
++#include <asm/apic.h>
++#include <asm/io.h>
++#include <asm/mpspec.h>
++
++#ifdef CONFIG_X86_64
++
++extern void __init clustered_apic_check(void);
++
++extern int gsi_irq_sharing(int gsi);
++#include <asm/proto.h>
++
++static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
++
++
++#else /* X86 */
++
++#ifdef CONFIG_X86_LOCAL_APIC
++#include <mach_apic.h>
++#include <mach_mpparse.h>
++#endif /* CONFIG_X86_LOCAL_APIC */
++
++static inline int gsi_irq_sharing(int gsi) { return gsi; }
++
++#endif /* X86 */
++
++#define BAD_MADT_ENTRY(entry, end) ( \
++ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
++ ((acpi_table_entry_header *)entry)->length < sizeof(*entry))
++
++#define PREFIX "ACPI: "
++
++int acpi_noirq __initdata; /* skip ACPI IRQ initialization */
++int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */
++int acpi_ht __initdata = 1; /* enable HT */
++
++int acpi_lapic;
++int acpi_ioapic;
++int acpi_strict;
++EXPORT_SYMBOL(acpi_strict);
++
++acpi_interrupt_flags acpi_sci_flags __initdata;
++int acpi_sci_override_gsi __initdata;
++int acpi_skip_timer_override __initdata;
++
++#ifdef CONFIG_X86_LOCAL_APIC
++static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
++#endif
++
++#ifndef __HAVE_ARCH_CMPXCHG
++#warning ACPI uses CMPXCHG, i486 and later hardware
++#endif
++
++#define MAX_MADT_ENTRIES 256
++u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
++ {[0 ... MAX_MADT_ENTRIES - 1] = 0xff };
++EXPORT_SYMBOL(x86_acpiid_to_apicid);
++
++/* --------------------------------------------------------------------------
++ Boot-time Configuration
++ -------------------------------------------------------------------------- */
++
++/*
++ * The default interrupt routing model is PIC (8259). This gets
++ * overridden if IOAPICs are enumerated (below).
++ */
++enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
++
++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
++
++/* rely on all ACPI tables being in the direct mapping */
++char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
++{
++ if (!phys_addr || !size)
++ return NULL;
++
++ if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE)
++ return __va(phys_addr);
++
++ return NULL;
++}
++
++#else
++
++/*
++ * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
++ * to map the target physical address. The problem is that set_fixmap()
++ * provides a single page, and it is possible that the page is not
++ * sufficient.
++ * By using this area, we can map up to MAX_IO_APICS pages temporarily,
++ * i.e. until the next __acpi_map_table() call.
++ *
++ * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
++ * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
++ * count idx down while incrementing the phys address.
++ */
++char *__acpi_map_table(unsigned long phys, unsigned long size)
++{
++ unsigned long base, offset, mapped_size;
++ int idx;
++
++#ifndef CONFIG_XEN
++ if (phys + size < 8 * 1024 * 1024)
++ return __va(phys);
++#endif
++
++ offset = phys & (PAGE_SIZE - 1);
++ mapped_size = PAGE_SIZE - offset;
++ set_fixmap(FIX_ACPI_END, phys);
++ base = fix_to_virt(FIX_ACPI_END);
++
++ /*
++ * Most cases can be covered by the below.
++ */
++ idx = FIX_ACPI_END;
++ while (mapped_size < size) {
++ if (--idx < FIX_ACPI_BEGIN)
++ return NULL; /* cannot handle this */
++ phys += PAGE_SIZE;
++ set_fixmap(idx, phys);
++ mapped_size += PAGE_SIZE;
++ }
++
++ return ((unsigned char *)base + offset);
++}
++#endif
++
++#ifdef CONFIG_PCI_MMCONFIG
++/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
++struct acpi_table_mcfg_config *pci_mmcfg_config;
++int pci_mmcfg_config_num;
++
++int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
++{
++ struct acpi_table_mcfg *mcfg;
++ unsigned long i;
++ int config_size;
++
++ if (!phys_addr || !size)
++ return -EINVAL;
++
++ mcfg = (struct acpi_table_mcfg *)__acpi_map_table(phys_addr, size);
++ if (!mcfg) {
++ printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
++ return -ENODEV;
++ }
++
++ /* how many config structures do we have */
++ pci_mmcfg_config_num = 0;
++ i = size - sizeof(struct acpi_table_mcfg);
++ while (i >= sizeof(struct acpi_table_mcfg_config)) {
++ ++pci_mmcfg_config_num;
++ i -= sizeof(struct acpi_table_mcfg_config);
++ };
++ if (pci_mmcfg_config_num == 0) {
++ printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
++ return -ENODEV;
++ }
++
++ config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
++ pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
++ if (!pci_mmcfg_config) {
++ printk(KERN_WARNING PREFIX
++ "No memory for MCFG config tables\n");
++ return -ENOMEM;
++ }
++
++ memcpy(pci_mmcfg_config, &mcfg->config, config_size);
++ for (i = 0; i < pci_mmcfg_config_num; ++i) {
++ if (mcfg->config[i].base_reserved) {
++ printk(KERN_ERR PREFIX
++ "MMCONFIG not in low 4GB of memory\n");
++ kfree(pci_mmcfg_config);
++ pci_mmcfg_config_num = 0;
++ return -ENODEV;
++ }
++ }
++
++ return 0;
++}
++#endif /* CONFIG_PCI_MMCONFIG */
++
++#ifdef CONFIG_X86_LOCAL_APIC
++static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
++{
++ struct acpi_table_madt *madt = NULL;
++
++ if (!phys_addr || !size || !cpu_has_apic)
++ return -EINVAL;
++
++ madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size);
++ if (!madt) {
++ printk(KERN_WARNING PREFIX "Unable to map MADT\n");
++ return -ENODEV;
++ }
++
++ if (madt->lapic_address) {
++ acpi_lapic_addr = (u64) madt->lapic_address;
++
++ printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
++ madt->lapic_address);
++ }
++
++ acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
++
++ return 0;
++}
++
++static int __init
++acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end)
++{
++ struct acpi_table_lapic *processor = NULL;
++
++ processor = (struct acpi_table_lapic *)header;
++
++ if (BAD_MADT_ENTRY(processor, end))
++ return -EINVAL;
++
++ acpi_table_print_madt_entry(header);
++
++ /* Record local apic id only when enabled */
++ if (processor->flags.enabled)
++ x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
++
++ /*
++ * We need to register disabled CPUs as well, to permit
++ * counting them. This allows us to size
++ * cpus_possible_map more accurately, so that we do not
++ * preallocate memory for all of NR_CPUS
++ * when we use CPU hotplug.
++ */
++ mp_register_lapic(processor->id, /* APIC ID */
++ processor->flags.enabled); /* Enabled? */
++
++ return 0;
++}
++
++static int __init
++acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
++ const unsigned long end)
++{
++ struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
++
++ lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr *)header;
++
++ if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
++ return -EINVAL;
++
++ acpi_lapic_addr = lapic_addr_ovr->address;
++
++ return 0;
++}
++
++static int __init
++acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
++{
++ struct acpi_table_lapic_nmi *lapic_nmi = NULL;
++
++ lapic_nmi = (struct acpi_table_lapic_nmi *)header;
++
++ if (BAD_MADT_ENTRY(lapic_nmi, end))
++ return -EINVAL;
++
++ acpi_table_print_madt_entry(header);
++
++ if (lapic_nmi->lint != 1)
++ printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
++
++ return 0;
++}
++
++#endif /*CONFIG_X86_LOCAL_APIC */
++
++#ifdef CONFIG_X86_IO_APIC
++
++static int __init
++acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
++{
++ struct acpi_table_ioapic *ioapic = NULL;
++
++ ioapic = (struct acpi_table_ioapic *)header;
++
++ if (BAD_MADT_ENTRY(ioapic, end))
++ return -EINVAL;
++
++ acpi_table_print_madt_entry(header);
++
++ mp_register_ioapic(ioapic->id,
++ ioapic->address, ioapic->global_irq_base);
++
++ return 0;
++}
++
++/*
++ * Parse Interrupt Source Override for the ACPI SCI
++ */
++static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
++{
++ if (trigger == 0) /* compatible SCI trigger is level */
++ trigger = 3;
++
++ if (polarity == 0) /* compatible SCI polarity is low */
++ polarity = 3;
++
++ /* Command-line over-ride via acpi_sci= */
++ if (acpi_sci_flags.trigger)
++ trigger = acpi_sci_flags.trigger;
++
++ if (acpi_sci_flags.polarity)
++ polarity = acpi_sci_flags.polarity;
++
++ /*
++ * mp_config_acpi_legacy_irqs() has already set up IRQs < 16.
++ * If GSI is < 16, this will update its flags,
++ * else it will create a new mp_irqs[] entry.
++ */
++ mp_override_legacy_irq(gsi, polarity, trigger, gsi);
++
++ /*
++ * stash the override to indicate we've been here,
++ * and for the later update of acpi_fadt
++ */
++ acpi_sci_override_gsi = gsi;
++ return;
++}
++
++static int __init
++acpi_parse_int_src_ovr(acpi_table_entry_header * header,
++ const unsigned long end)
++{
++ struct acpi_table_int_src_ovr *intsrc = NULL;
++
++ intsrc = (struct acpi_table_int_src_ovr *)header;
++
++ if (BAD_MADT_ENTRY(intsrc, end))
++ return -EINVAL;
++
++ acpi_table_print_madt_entry(header);
++
++ if (intsrc->bus_irq == acpi_fadt.sci_int) {
++ acpi_sci_ioapic_setup(intsrc->global_irq,
++ intsrc->flags.polarity,
++ intsrc->flags.trigger);
++ return 0;
++ }
++
++ if (acpi_skip_timer_override &&
++ intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
++ printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
++ return 0;
++ }
++
++ mp_override_legacy_irq(intsrc->bus_irq,
++ intsrc->flags.polarity,
++ intsrc->flags.trigger, intsrc->global_irq);
++
++ return 0;
++}
++
++static int __init
++acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end)
++{
++ struct acpi_table_nmi_src *nmi_src = NULL;
++
++ nmi_src = (struct acpi_table_nmi_src *)header;
++
++ if (BAD_MADT_ENTRY(nmi_src, end))
++ return -EINVAL;
++
++ acpi_table_print_madt_entry(header);
++
++ /* TBD: Support nmi_src entries? */
++
++ return 0;
++}
++
++#endif /* CONFIG_X86_IO_APIC */
++
++/*
++ * acpi_pic_sci_set_trigger()
++ *
++ * use ELCR to set PIC-mode trigger type for SCI
++ *
++ * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
++ * it may require Edge Trigger -- use "acpi_sci=edge"
++ *
++ * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers
++ * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
++ * ECLR1 is IRQ's 0-7 (IRQ 0, 1, 2 must be 0)
++ * ECLR2 is IRQ's 8-15 (IRQ 8, 13 must be 0)
++ */
++
++void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
++{
++ unsigned int mask = 1 << irq;
++ unsigned int old, new;
++
++ /* Real old ELCR mask */
++ old = inb(0x4d0) | (inb(0x4d1) << 8);
++
++ /*
++ * If we use ACPI to set PCI IRQs, then we should clear the ELCR,
++ * since we will set it correctly as we enable the PCI IRQ
++ * routing.
++ */
++ new = acpi_noirq ? old : 0;
++
++ /*
++ * Update the SCI information in the ELCR; it isn't in the PCI
++ * routing tables.
++ */
++ switch (trigger) {
++ case 1: /* Edge - clear */
++ new &= ~mask;
++ break;
++ case 3: /* Level - set */
++ new |= mask;
++ break;
++ }
++
++ if (old == new)
++ return;
++
++ printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
++ outb(new, 0x4d0);
++ outb(new >> 8, 0x4d1);
++}
++
++int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
++{
++#ifdef CONFIG_X86_IO_APIC
++ if (use_pci_vector() && !platform_legacy_irq(gsi))
++ *irq = IO_APIC_VECTOR(gsi);
++ else
++#endif
++ *irq = gsi_irq_sharing(gsi);
++ return 0;
++}
++
++/*
++ * success: return IRQ number (>=0)
++ * failure: return < 0
++ */
++int acpi_register_gsi(u32 gsi, int triggering, int polarity)
++{
++ unsigned int irq;
++ unsigned int plat_gsi = gsi;
++
++#ifdef CONFIG_PCI
++ /*
++ * Make sure all (legacy) PCI IRQs are set as level-triggered.
++ */
++ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
++ extern void eisa_set_level_irq(unsigned int irq);
++
++ if (triggering == ACPI_LEVEL_SENSITIVE)
++ eisa_set_level_irq(gsi);
++ }
++#endif
++
++#ifdef CONFIG_X86_IO_APIC
++ if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
++ plat_gsi = mp_register_gsi(gsi, triggering, polarity);
++ }
++#endif
++ acpi_gsi_to_irq(plat_gsi, &irq);
++ return irq;
++}
++
++EXPORT_SYMBOL(acpi_register_gsi);
++
++/*
++ * ACPI based hotplug support for CPU
++ */
++#ifdef CONFIG_ACPI_HOTPLUG_CPU
++int acpi_map_lsapic(acpi_handle handle, int *pcpu)
++{
++ /* TBD */
++ return -EINVAL;
++}
++
++EXPORT_SYMBOL(acpi_map_lsapic);
++
++int acpi_unmap_lsapic(int cpu)
++{
++ /* TBD */
++ return -EINVAL;
++}
++
++EXPORT_SYMBOL(acpi_unmap_lsapic);
++#endif /* CONFIG_ACPI_HOTPLUG_CPU */
++
++int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
++{
++ /* TBD */
++ return -EINVAL;
++}
++
++EXPORT_SYMBOL(acpi_register_ioapic);
++
++int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
++{
++ /* TBD */
++ return -EINVAL;
++}
++
++EXPORT_SYMBOL(acpi_unregister_ioapic);
++
++static unsigned long __init
++acpi_scan_rsdp(unsigned long start, unsigned long length)
++{
++ unsigned long offset = 0;
++ unsigned long sig_len = sizeof("RSD PTR ") - 1;
++ unsigned long vstart = (unsigned long)isa_bus_to_virt(start);
++
++ /*
++ * Scan all 16-byte boundaries of the physical memory region for the
++ * RSDP signature.
++ */
++ for (offset = 0; offset < length; offset += 16) {
++ if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len))
++ continue;
++ return (start + offset);
++ }
++
++ return 0;
++}
++
++static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
++{
++ struct acpi_table_sbf *sb;
++
++ if (!phys_addr || !size)
++ return -EINVAL;
++
++ sb = (struct acpi_table_sbf *)__acpi_map_table(phys_addr, size);
++ if (!sb) {
++ printk(KERN_WARNING PREFIX "Unable to map SBF\n");
++ return -ENODEV;
++ }
++
++ sbf_port = sb->sbf_cmos; /* Save CMOS port */
++
++ return 0;
++}
++
++#ifdef CONFIG_HPET_TIMER
++
++static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
++{
++ struct acpi_table_hpet *hpet_tbl;
++
++ if (!phys || !size)
++ return -EINVAL;
++
++ hpet_tbl = (struct acpi_table_hpet *)__acpi_map_table(phys, size);
++ if (!hpet_tbl) {
++ printk(KERN_WARNING PREFIX "Unable to map HPET\n");
++ return -ENODEV;
++ }
++
++ if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) {
++ printk(KERN_WARNING PREFIX "HPET timers must be located in "
++ "memory.\n");
++ return -1;
++ }
++#ifdef CONFIG_X86_64
++ vxtime.hpet_address = hpet_tbl->addr.addrl |
++ ((long)hpet_tbl->addr.addrh << 32);
++
++ printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
++ hpet_tbl->id, vxtime.hpet_address);
++#else /* X86 */
++ {
++ extern unsigned long hpet_address;
++
++ hpet_address = hpet_tbl->addr.addrl;
++ printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
++ hpet_tbl->id, hpet_address);
++ }
++#endif /* X86 */
++
++ return 0;
++}
++#else
++#define acpi_parse_hpet NULL
++#endif
++
++#ifdef CONFIG_X86_PM_TIMER
++extern u32 pmtmr_ioport;
++#endif
++
++static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
++{
++ struct fadt_descriptor *fadt = NULL;
++
++ fadt = (struct fadt_descriptor *)__acpi_map_table(phys, size);
++ if (!fadt) {
++ printk(KERN_WARNING PREFIX "Unable to map FADT\n");
++ return 0;
++ }
++ /* initialize sci_int early for INT_SRC_OVR MADT parsing */
++ acpi_fadt.sci_int = fadt->sci_int;
++
++ /* initialize rev and apic_phys_dest_mode for x86_64 genapic */
++ acpi_fadt.revision = fadt->revision;
++ acpi_fadt.force_apic_physical_destination_mode =
++ fadt->force_apic_physical_destination_mode;
++
++#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN)
++ /* detect the location of the ACPI PM Timer */
++ if (fadt->revision >= FADT2_REVISION_ID) {
++ /* FADT rev. 2 */
++ if (fadt->xpm_tmr_blk.address_space_id !=
++ ACPI_ADR_SPACE_SYSTEM_IO)
++ return 0;
++
++ pmtmr_ioport = fadt->xpm_tmr_blk.address;
++ /*
++ * "X" fields are optional extensions to the original V1.0
++ * fields, so we must selectively expand V1.0 fields if the
++ * corresponding X field is zero.
++ */
++ if (!pmtmr_ioport)
++ pmtmr_ioport = fadt->V1_pm_tmr_blk;
++ } else {
++ /* FADT rev. 1 */
++ pmtmr_ioport = fadt->V1_pm_tmr_blk;
++ }
++ if (pmtmr_ioport)
++ printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
++ pmtmr_ioport);
++#endif
++ return 0;
++}
++
++unsigned long __init acpi_find_rsdp(void)
++{
++ unsigned long rsdp_phys = 0;
++
++ if (efi_enabled) {
++ if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
++ return efi.acpi20;
++ else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
++ return efi.acpi;
++ }
++ /*
++ * Scan memory looking for the RSDP signature. First search EBDA (low
++ * memory) paragraphs and then search upper memory (E0000-FFFFF).
++ */
++ rsdp_phys = acpi_scan_rsdp(0, 0x400);
++ if (!rsdp_phys)
++ rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
++
++ return rsdp_phys;
++}
++
++#ifdef CONFIG_X86_LOCAL_APIC
++/*
++ * Parse LAPIC entries in MADT
++ * returns 0 on success, < 0 on error
++ */
++static int __init acpi_parse_madt_lapic_entries(void)
++{
++ int count;
++
++ if (!cpu_has_apic)
++ return -ENODEV;
++
++ /*
++ * Note that the LAPIC address is obtained from the MADT (32-bit value)
++ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
++ */
++
++ count =
++ acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR,
++ acpi_parse_lapic_addr_ovr, 0);
++ if (count < 0) {
++ printk(KERN_ERR PREFIX
++ "Error parsing LAPIC address override entry\n");
++ return count;
++ }
++
++ mp_register_lapic_address(acpi_lapic_addr);
++
++ count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
++ MAX_APICS);
++ if (!count) {
++ printk(KERN_ERR PREFIX "No LAPIC entries present\n");
++ /* TBD: Cleanup to allow fallback to MPS */
++ return -ENODEV;
++ } else if (count < 0) {
++ printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
++ /* TBD: Cleanup to allow fallback to MPS */
++ return count;
++ }
++
++ count =
++ acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
++ if (count < 0) {
++ printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
++ /* TBD: Cleanup to allow fallback to MPS */
++ return count;
++ }
++ return 0;
++}
++#endif /* CONFIG_X86_LOCAL_APIC */
++
++#ifdef CONFIG_X86_IO_APIC
++/*
++ * Parse IOAPIC related entries in MADT
++ * returns 0 on success, < 0 on error
++ */
++static int __init acpi_parse_madt_ioapic_entries(void)
++{
++ int count;
++
++ /*
++ * ACPI interpreter is required to complete interrupt setup,
++ * so if it is off, don't enumerate the io-apics with ACPI.
++ * If MPS is present, it will handle them,
++ * otherwise the system will stay in PIC mode
++ */
++ if (acpi_disabled || acpi_noirq) {
++ return -ENODEV;
++ }
++
++ if (!cpu_has_apic)
++ return -ENODEV;
++
++ /*
++ * if "noapic" boot option, don't look for IO-APICs
++ */
++ if (skip_ioapic_setup) {
++ printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
++ "due to 'noapic' option.\n");
++ return -ENODEV;
++ }
++
++ count =
++ acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic,
++ MAX_IO_APICS);
++ if (!count) {
++ printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
++ return -ENODEV;
++ } else if (count < 0) {
++ printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
++ return count;
++ }
++
++ count =
++ acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr,
++ NR_IRQ_VECTORS);
++ if (count < 0) {
++ printk(KERN_ERR PREFIX
++ "Error parsing interrupt source overrides entry\n");
++ /* TBD: Cleanup to allow fallback to MPS */
++ return count;
++ }
++
++ /*
++ * If BIOS did not supply an INT_SRC_OVR for the SCI
++ * pretend we got one so we can set the SCI flags.
++ */
++ if (!acpi_sci_override_gsi)
++ acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
++
++ /* Fill in identity legacy mappings where there is no override */
++ mp_config_acpi_legacy_irqs();
++
++ count =
++ acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src,
++ NR_IRQ_VECTORS);
++ if (count < 0) {
++ printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
++ /* TBD: Cleanup to allow fallback to MPS */
++ return count;
++ }
++
++ return 0;
++}
++#else
++static inline int acpi_parse_madt_ioapic_entries(void)
++{
++ return -1;
++}
++#endif /* !CONFIG_X86_IO_APIC */
++
++static void __init acpi_process_madt(void)
++{
++#ifdef CONFIG_X86_LOCAL_APIC
++ int count, error;
++
++ count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
++ if (count >= 1) {
++
++ /*
++ * Parse MADT LAPIC entries
++ */
++ error = acpi_parse_madt_lapic_entries();
++ if (!error) {
++ acpi_lapic = 1;
++
++#ifdef CONFIG_X86_GENERICARCH
++ generic_bigsmp_probe();
++#endif
++ /*
++ * Parse MADT IO-APIC entries
++ */
++ error = acpi_parse_madt_ioapic_entries();
++ if (!error) {
++ acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
++ acpi_irq_balance_set(NULL);
++ acpi_ioapic = 1;
++
++ smp_found_config = 1;
++ clustered_apic_check();
++ }
++ }
++ if (error == -EINVAL) {
++ /*
++ * Dell Precision Workstation 410, 610 come here.
++ */
++ printk(KERN_ERR PREFIX
++ "Invalid BIOS MADT, disabling ACPI\n");
++ disable_acpi();
++ }
++ }
++#endif
++ return;
++}
++
++extern int acpi_force;
++
++#ifdef __i386__
++
++static int __init disable_acpi_irq(struct dmi_system_id *d)
++{
++ if (!acpi_force) {
++ printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
++ d->ident);
++ acpi_noirq_set();
++ }
++ return 0;
++}
++
++static int __init disable_acpi_pci(struct dmi_system_id *d)
++{
++ if (!acpi_force) {
++ printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
++ d->ident);
++ acpi_disable_pci();
++ }
++ return 0;
++}
++
++static int __init dmi_disable_acpi(struct dmi_system_id *d)
++{
++ if (!acpi_force) {
++ printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
++ disable_acpi();
++ } else {
++ printk(KERN_NOTICE
++ "Warning: DMI blacklist says broken, but acpi forced\n");
++ }
++ return 0;
++}
++
++/*
++ * Limit ACPI to CPU enumeration for HT
++ */
++static int __init force_acpi_ht(struct dmi_system_id *d)
++{
++ if (!acpi_force) {
++ printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
++ d->ident);
++ disable_acpi();
++ acpi_ht = 1;
++ } else {
++ printk(KERN_NOTICE
++ "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
++ }
++ return 0;
++}
++
++/*
++ * If your system is blacklisted here, but you find that acpi=force
++ * works for you, please contact acpi-devel@sourceforge.net
++ */
++static struct dmi_system_id __initdata acpi_dmi_table[] = {
++ /*
++ * Boxes that need ACPI disabled
++ */
++ {
++ .callback = dmi_disable_acpi,
++ .ident = "IBM Thinkpad",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
++ DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
++ },
++ },
++
++ /*
++ * Boxes that need acpi=ht
++ */
++ {
++ .callback = force_acpi_ht,
++ .ident = "FSC Primergy T850",
++ .matches = {
++ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
++ DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "DELL GX240",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"),
++ DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "HP VISUALIZE NT Workstation",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
++ DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "Compaq Workstation W8000",
++ .matches = {
++ DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
++ DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "ASUS P4B266",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
++ DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "ASUS P2B-DS",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
++ DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "ASUS CUR-DLS",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
++ DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "ABIT i440BX-W83977",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
++ DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "IBM Bladecenter",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
++ DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "IBM eServer xSeries 360",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
++ DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "IBM eserver xSeries 330",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
++ DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
++ },
++ },
++ {
++ .callback = force_acpi_ht,
++ .ident = "IBM eserver xSeries 440",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
++ DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
++ },
++ },
++
++ /*
++ * Boxes that need ACPI PCI IRQ routing disabled
++ */
++ {
++ .callback = disable_acpi_irq,
++ .ident = "ASUS A7V",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
++ DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
++ /* newer BIOS, Revision 1011, does work */
++ DMI_MATCH(DMI_BIOS_VERSION,
++ "ASUS A7V ACPI BIOS Revision 1007"),
++ },
++ },
++
++ /*
++ * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
++ */
++ { /* _BBN 0 bug */
++ .callback = disable_acpi_pci,
++ .ident = "ASUS PR-DLS",
++ .matches = {
++ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
++ DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
++ DMI_MATCH(DMI_BIOS_VERSION,
++ "ASUS PR-DLS ACPI BIOS Revision 1010"),
++ DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
++ },
++ },
++ {
++ .callback = disable_acpi_pci,
++ .ident = "Acer TravelMate 36x Laptop",
++ .matches = {
++ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
++ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
++ },
++ },
++ {}
++};
++
++#endif /* __i386__ */
++
++/*
++ * acpi_boot_table_init() and acpi_boot_init()
++ * called from setup_arch(), always.
++ * 1. checksums all tables
++ * 2. enumerates lapics
++ * 3. enumerates io-apics
++ *
++ * acpi_table_init() is separate to allow reading SRAT without
++ * other side effects.
++ *
++ * side effects of acpi_boot_init:
++ * acpi_lapic = 1 if LAPIC found
++ * acpi_ioapic = 1 if IOAPIC found
++ * if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
++ * if acpi_blacklisted() acpi_disabled = 1;
++ * acpi_irq_model=...
++ * ...
++ *
++ * return value: (currently ignored)
++ * 0: success
++ * !0: failure
++ */
++
++int __init acpi_boot_table_init(void)
++{
++ int error;
++
++#ifdef __i386__
++ dmi_check_system(acpi_dmi_table);
++#endif
++
++ /*
++ * If acpi_disabled, bail out
++ * One exception: acpi=ht continues far enough to enumerate LAPICs
++ */
++ if (acpi_disabled && !acpi_ht)
++ return 1;
++
++ /*
++ * Initialize the ACPI boot-time table parser.
++ */
++ error = acpi_table_init();
++ if (error) {
++ disable_acpi();
++ return error;
++ }
++
++ acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
++
++ /*
++ * blacklist may disable ACPI entirely
++ */
++ error = acpi_blacklisted();
++ if (error) {
++ if (acpi_force) {
++ printk(KERN_WARNING PREFIX "acpi=force override\n");
++ } else {
++ printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
++ disable_acpi();
++ return error;
++ }
++ }
++
++ return 0;
++}
++
++int __init acpi_boot_init(void)
++{
++ /*
++ * If acpi_disabled, bail out
++ * One exception: acpi=ht continues far enough to enumerate LAPICs
++ */
++ if (acpi_disabled && !acpi_ht)
++ return 1;
++
++ acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
++
++ /*
++ * set sci_int and PM timer address
++ */
++ acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
++
++ /*
++ * Process the Multiple APIC Description Table (MADT), if present
++ */
++ acpi_process_madt();
++
++ acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
++
++ return 0;
++}
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/apic-xen.c 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,155 @@
++/*
++ * Local APIC handling, local APIC timers
++ *
++ * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
++ *
++ * Fixes
++ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
++ * thanks to Eric Gilmore
++ * and Rolf G. Tews
++ * for testing these extensively.
++ * Maciej W. Rozycki : Various updates and fixes.
++ * Mikael Pettersson : Power Management for UP-APIC.
++ * Pavel Machek and
++ * Mikael Pettersson : PM converted to driver model.
++ */
++
++#include <linux/init.h>
++
++#include <linux/mm.h>
++#include <linux/delay.h>
++#include <linux/bootmem.h>
++#include <linux/smp_lock.h>
++#include <linux/interrupt.h>
++#include <linux/mc146818rtc.h>
++#include <linux/kernel_stat.h>
++#include <linux/sysdev.h>
++#include <linux/cpu.h>
++#include <linux/module.h>
++
++#include <asm/atomic.h>
++#include <asm/smp.h>
++#include <asm/mtrr.h>
++#include <asm/mpspec.h>
++#include <asm/desc.h>
++#include <asm/arch_hooks.h>
++#include <asm/hpet.h>
++#include <asm/i8253.h>
++#include <asm/nmi.h>
++
++#include <mach_apic.h>
++#include <mach_apicdef.h>
++#include <mach_ipi.h>
++
++#include "io_ports.h"
++
++#ifndef CONFIG_XEN
++/*
++ * cpu_mask that denotes the CPUs that need the timer interrupt coming in as
++ * IPIs in place of local APIC timers
++ */
++static cpumask_t timer_bcast_ipi;
++#endif
++
++/*
++ * Knob to control our willingness to enable the local APIC.
++ */
++int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
++
++/*
++ * Debug level
++ */
++int apic_verbosity;
++
++#ifndef CONFIG_XEN
++static int modern_apic(void)
++{
++ unsigned int lvr, version;
++ /* AMD systems use old APIC versions, so check the CPU */
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
++ boot_cpu_data.x86 >= 0xf)
++ return 1;
++ lvr = apic_read(APIC_LVR);
++ version = GET_APIC_VERSION(lvr);
++ return version >= 0x14;
++}
++#endif /* !CONFIG_XEN */
++
++/*
++ * 'What should we do if we get a hw irq event on an illegal vector?'
++ * Each architecture has to answer this itself.
++ */
++void ack_bad_irq(unsigned int irq)
++{
++ printk("unexpected IRQ trap at vector %02x\n", irq);
++ /*
++ * Currently unexpected vectors happen only on SMP and APIC.
++ * We _must_ ack these because every local APIC has only N
++ * irq slots per priority level, and a 'hanging, unacked' IRQ
++ * holds up an irq slot - in excessive cases (when multiple
++ * unexpected vectors occur) that might lock up the APIC
++ * completely.
++ * But only ack when the APIC is enabled -AK
++ */
++ if (cpu_has_apic)
++ ack_APIC_irq();
++}
++
++int get_physical_broadcast(void)
++{
++ return 0xff;
++}
++
++#ifndef CONFIG_XEN
++#ifndef CONFIG_SMP
++static void up_apic_timer_interrupt_call(struct pt_regs *regs)
++{
++ int cpu = smp_processor_id();
++
++ /*
++ * the NMI deadlock-detector uses this.
++ */
++ per_cpu(irq_stat, cpu).apic_timer_irqs++;
++
++ smp_local_timer_interrupt(regs);
++}
++#endif
++
++void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
++{
++ cpumask_t mask;
++
++ cpus_and(mask, cpu_online_map, timer_bcast_ipi);
++ if (!cpus_empty(mask)) {
++#ifdef CONFIG_SMP
++ send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
++#else
++ /*
++ * We can directly call the apic timer interrupt handler
++ * in UP case. Minus all irq related functions
++ */
++ up_apic_timer_interrupt_call(regs);
++#endif
++ }
++}
++#endif
++
++int setup_profiling_timer(unsigned int multiplier)
++{
++ return -EINVAL;
++}
++
++/*
++ * This initializes the IO-APIC and APIC hardware if this is
++ * a UP kernel.
++ */
++int __init APIC_init_uniprocessor (void)
++{
++#ifdef CONFIG_X86_IO_APIC
++ if (smp_found_config)
++ if (!skip_ioapic_setup && nr_ioapics)
++ setup_IO_APIC();
++#endif
++
++ return 0;
++}
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/cpu/common-xen.c 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,743 @@
++#include <linux/init.h>
++#include <linux/string.h>
++#include <linux/delay.h>
++#include <linux/smp.h>
++#include <linux/module.h>
++#include <linux/percpu.h>
++#include <linux/bootmem.h>
++#include <asm/semaphore.h>
++#include <asm/processor.h>
++#include <asm/i387.h>
++#include <asm/msr.h>
++#include <asm/io.h>
++#include <asm/mmu_context.h>
++#include <asm/mtrr.h>
++#include <asm/mce.h>
++#ifdef CONFIG_X86_LOCAL_APIC
++#include <asm/mpspec.h>
++#include <asm/apic.h>
++#include <mach_apic.h>
++#else
++#ifdef CONFIG_XEN
++#define phys_pkg_id(a,b) a
++#endif
++#endif
++#include <asm/hypervisor.h>
++
++#include "cpu.h"
++
++DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
++EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
++
++#ifndef CONFIG_XEN
++DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
++EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
++#endif
++
++static int cachesize_override __cpuinitdata = -1;
++static int disable_x86_fxsr __cpuinitdata;
++static int disable_x86_serial_nr __cpuinitdata = 1;
++static int disable_x86_sep __cpuinitdata;
++
++struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
++
++extern int disable_pse;
++
++static void default_init(struct cpuinfo_x86 * c)
++{
++ /* Not much we can do here... */
++ /* Check if at least it has cpuid */
++ if (c->cpuid_level == -1) {
++ /* No cpuid. It must be an ancient CPU */
++ if (c->x86 == 4)
++ strcpy(c->x86_model_id, "486");
++ else if (c->x86 == 3)
++ strcpy(c->x86_model_id, "386");
++ }
++}
++
++static struct cpu_dev default_cpu = {
++ .c_init = default_init,
++ .c_vendor = "Unknown",
++};
++static struct cpu_dev * this_cpu = &default_cpu;
++
++static int __init cachesize_setup(char *str)
++{
++ get_option (&str, &cachesize_override);
++ return 1;
++}
++__setup("cachesize=", cachesize_setup);
++
++int __cpuinit get_model_name(struct cpuinfo_x86 *c)
++{
++ unsigned int *v;
++ char *p, *q;
++
++ if (cpuid_eax(0x80000000) < 0x80000004)
++ return 0;
++
++ v = (unsigned int *) c->x86_model_id;
++ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
++ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
++ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
++ c->x86_model_id[48] = 0;
++
++ /* Intel chips right-justify this string for some dumb reason;
++ undo that brain damage */
++ p = q = &c->x86_model_id[0];
++ while ( *p == ' ' )
++ p++;
++ if ( p != q ) {
++ while ( *p )
++ *q++ = *p++;
++ while ( q <= &c->x86_model_id[48] )
++ *q++ = '\0'; /* Zero-pad the rest */
++ }
++
++ return 1;
++}
++
++
++void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
++{
++ unsigned int n, dummy, ecx, edx, l2size;
++
++ n = cpuid_eax(0x80000000);
++
++ if (n >= 0x80000005) {
++ cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
++ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
++ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
++ c->x86_cache_size=(ecx>>24)+(edx>>24);
++ }
++
++ if (n < 0x80000006) /* Some chips just have a large L1. */
++ return;
++
++ ecx = cpuid_ecx(0x80000006);
++ l2size = ecx >> 16;
++
++ /* do processor-specific cache resizing */
++ if (this_cpu->c_size_cache)
++ l2size = this_cpu->c_size_cache(c,l2size);
++
++ /* Allow user to override all this if necessary. */
++ if (cachesize_override != -1)
++ l2size = cachesize_override;
++
++ if ( l2size == 0 )
++ return; /* Again, no L2 cache is possible */
++
++ c->x86_cache_size = l2size;
++
++ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
++ l2size, ecx & 0xFF);
++}
++
++/* Naming convention should be: <Name> [(<Codename>)] */
++/* This table is only used if init_<vendor>() below doesn't set the model name; */
++/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
++
++/* Look up CPU names by table lookup. */
++static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
++{
++ struct cpu_model_info *info;
++
++ if ( c->x86_model >= 16 )
++ return NULL; /* Range check */
++
++ if (!this_cpu)
++ return NULL;
++
++ info = this_cpu->c_models;
++
++ while (info && info->family) {
++ if (info->family == c->x86)
++ return info->model_names[c->x86_model];
++ info++;
++ }
++ return NULL; /* Not found */
++}
++
++
++static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
++{
++ char *v = c->x86_vendor_id;
++ int i;
++ static int printed;
++
++ for (i = 0; i < X86_VENDOR_NUM; i++) {
++ if (cpu_devs[i]) {
++ if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
++ (cpu_devs[i]->c_ident[1] &&
++ !strcmp(v,cpu_devs[i]->c_ident[1]))) {
++ c->x86_vendor = i;
++ if (!early)
++ this_cpu = cpu_devs[i];
++ return;
++ }
++ }
++ }
++ if (!printed) {
++ printed++;
++ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
++ printk(KERN_ERR "CPU: Your system may be unstable.\n");
++ }
++ c->x86_vendor = X86_VENDOR_UNKNOWN;
++ this_cpu = &default_cpu;
++}
++
++
++static int __init x86_fxsr_setup(char * s)
++{
++ disable_x86_fxsr = 1;
++ return 1;
++}
++__setup("nofxsr", x86_fxsr_setup);
++
++
++static int __init x86_sep_setup(char * s)
++{
++ disable_x86_sep = 1;
++ return 1;
++}
++__setup("nosep", x86_sep_setup);
++
++
++/* Standard macro to see if a specific flag is changeable */
++static inline int flag_is_changeable_p(u32 flag)
++{
++ u32 f1, f2;
++
++ asm("pushfl\n\t"
++ "pushfl\n\t"
++ "popl %0\n\t"
++ "movl %0,%1\n\t"
++ "xorl %2,%0\n\t"
++ "pushl %0\n\t"
++ "popfl\n\t"
++ "pushfl\n\t"
++ "popl %0\n\t"
++ "popfl\n\t"
++ : "=&r" (f1), "=&r" (f2)
++ : "ir" (flag));
++
++ return ((f1^f2) & flag) != 0;
++}
++
++
++/* Probe for the CPUID instruction */
++static int __cpuinit have_cpuid_p(void)
++{
++ return flag_is_changeable_p(X86_EFLAGS_ID);
++}
++
++/* Do minimum CPU detection early.
++ Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
++ The others are not touched to avoid unwanted side effects.
++
++ WARNING: this function is only called on the BP. Don't add code here
++ that is supposed to run on all CPUs. */
++static void __init early_cpu_detect(void)
++{
++ struct cpuinfo_x86 *c = &boot_cpu_data;
++
++ c->x86_cache_alignment = 32;
++
++ if (!have_cpuid_p())
++ return;
++
++ /* Get vendor name */
++ cpuid(0x00000000, &c->cpuid_level,
++ (int *)&c->x86_vendor_id[0],
++ (int *)&c->x86_vendor_id[8],
++ (int *)&c->x86_vendor_id[4]);
++
++ get_cpu_vendor(c, 1);
++
++ c->x86 = 4;
++ if (c->cpuid_level >= 0x00000001) {
++ u32 junk, tfms, cap0, misc;
++ cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
++ c->x86 = (tfms >> 8) & 15;
++ c->x86_model = (tfms >> 4) & 15;
++ if (c->x86 == 0xf)
++ c->x86 += (tfms >> 20) & 0xff;
++ if (c->x86 >= 0x6)
++ c->x86_model += ((tfms >> 16) & 0xF) << 4;
++ c->x86_mask = tfms & 15;
++ if (cap0 & (1<<19))
++ c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
++ }
++}
++
++void __cpuinit generic_identify(struct cpuinfo_x86 * c)
++{
++ u32 tfms, xlvl;
++ int ebx;
++
++ if (have_cpuid_p()) {
++ /* Get vendor name */
++ cpuid(0x00000000, &c->cpuid_level,
++ (int *)&c->x86_vendor_id[0],
++ (int *)&c->x86_vendor_id[8],
++ (int *)&c->x86_vendor_id[4]);
++
++ get_cpu_vendor(c, 0);
++ /* Initialize the standard set of capabilities */
++ /* Note that the vendor-specific code below might override */
++
++ /* Intel-defined flags: level 0x00000001 */
++ if ( c->cpuid_level >= 0x00000001 ) {
++ u32 capability, excap;
++ cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
++ c->x86_capability[0] = capability;
++ c->x86_capability[4] = excap;
++ c->x86 = (tfms >> 8) & 15;
++ c->x86_model = (tfms >> 4) & 15;
++ if (c->x86 == 0xf)
++ c->x86 += (tfms >> 20) & 0xff;
++ if (c->x86 >= 0x6)
++ c->x86_model += ((tfms >> 16) & 0xF) << 4;
++ c->x86_mask = tfms & 15;
++#ifdef CONFIG_X86_HT
++ c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
++#else
++ c->apicid = (ebx >> 24) & 0xFF;
++#endif
++ } else {
++ /* Have CPUID level 0 only - unheard of */
++ c->x86 = 4;
++ }
++
++ /* AMD-defined flags: level 0x80000001 */
++ xlvl = cpuid_eax(0x80000000);
++ if ( (xlvl & 0xffff0000) == 0x80000000 ) {
++ if ( xlvl >= 0x80000001 ) {
++ c->x86_capability[1] = cpuid_edx(0x80000001);
++ c->x86_capability[6] = cpuid_ecx(0x80000001);
++ }
++ if ( xlvl >= 0x80000004 )
++ get_model_name(c); /* Default name */
++ }
++ }
++
++ early_intel_workaround(c);
++
++#ifdef CONFIG_X86_HT
++ c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
++#endif
++}
++
++static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
++{
++ if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
++ /* Disable processor serial number */
++ unsigned long lo,hi;
++ rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
++ lo |= 0x200000;
++ wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
++ printk(KERN_NOTICE "CPU serial number disabled.\n");
++ clear_bit(X86_FEATURE_PN, c->x86_capability);
++
++ /* Disabling the serial number may affect the cpuid level */
++ c->cpuid_level = cpuid_eax(0);
++ }
++}
++
++static int __init x86_serial_nr_setup(char *s)
++{
++ disable_x86_serial_nr = 0;
++ return 1;
++}
++__setup("serialnumber", x86_serial_nr_setup);
++
++
++
++/*
++ * This does the hard work of actually picking apart the CPU stuff...
++ */
++void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
++{
++ int i;
++
++ c->loops_per_jiffy = loops_per_jiffy;
++ c->x86_cache_size = -1;
++ c->x86_vendor = X86_VENDOR_UNKNOWN;
++ c->cpuid_level = -1; /* CPUID not detected */
++ c->x86_model = c->x86_mask = 0; /* So far unknown... */
++ c->x86_vendor_id[0] = '\0'; /* Unset */
++ c->x86_model_id[0] = '\0'; /* Unset */
++ c->x86_max_cores = 1;
++ memset(&c->x86_capability, 0, sizeof c->x86_capability);
++
++ if (!have_cpuid_p()) {
++ /* First of all, decide if this is a 486 or higher */
++ /* It's a 486 if we can modify the AC flag */
++ if ( flag_is_changeable_p(X86_EFLAGS_AC) )
++ c->x86 = 4;
++ else
++ c->x86 = 3;
++ }
++
++ generic_identify(c);
++
++ printk(KERN_DEBUG "CPU: After generic identify, caps:");
++ for (i = 0; i < NCAPINTS; i++)
++ printk(" %08lx", c->x86_capability[i]);
++ printk("\n");
++
++ if (this_cpu->c_identify) {
++ this_cpu->c_identify(c);
++
++ printk(KERN_DEBUG "CPU: After vendor identify, caps:");
++ for (i = 0; i < NCAPINTS; i++)
++ printk(" %08lx", c->x86_capability[i]);
++ printk("\n");
++ }
++
++ /*
++ * Vendor-specific initialization. In this section we
++ * canonicalize the feature flags, meaning if there are
++ * features a certain CPU supports which CPUID doesn't
++ * tell us, CPUID claiming incorrect flags, or other bugs,
++ * we handle them here.
++ *
++ * At the end of this section, c->x86_capability better
++ * indicate the features this CPU genuinely supports!
++ */
++ if (this_cpu->c_init)
++ this_cpu->c_init(c);
++
++ /* Disable the PN if appropriate */
++ squash_the_stupid_serial_number(c);
++
++ /*
++ * The vendor-specific functions might have changed features. Now
++ * we do "generic changes."
++ */
++
++ /* TSC disabled? */
++ if ( tsc_disable )
++ clear_bit(X86_FEATURE_TSC, c->x86_capability);
++
++ /* FXSR disabled? */
++ if (disable_x86_fxsr) {
++ clear_bit(X86_FEATURE_FXSR, c->x86_capability);
++ clear_bit(X86_FEATURE_XMM, c->x86_capability);
++ }
++
++ /* SEP disabled? */
++ if (disable_x86_sep)
++ clear_bit(X86_FEATURE_SEP, c->x86_capability);
++
++ if (disable_pse)
++ clear_bit(X86_FEATURE_PSE, c->x86_capability);
++
++ /* If the model name is still unset, do table lookup. */
++ if ( !c->x86_model_id[0] ) {
++ char *p;
++ p = table_lookup_model(c);
++ if ( p )
++ strcpy(c->x86_model_id, p);
++ else
++ /* Last resort... */
++ sprintf(c->x86_model_id, "%02x/%02x",
++ c->x86, c->x86_model);
++ }
++
++ /* Now the feature flags better reflect actual CPU features! */
++
++ printk(KERN_DEBUG "CPU: After all inits, caps:");
++ for (i = 0; i < NCAPINTS; i++)
++ printk(" %08lx", c->x86_capability[i]);
++ printk("\n");
++
++ /*
++ * On SMP, boot_cpu_data holds the common feature set between
++ * all CPUs; so make sure that we indicate which features are
++ * common between the CPUs. The first time this routine gets
++ * executed, c == &boot_cpu_data.
++ */
++ if ( c != &boot_cpu_data ) {
++ /* AND the already accumulated flags with these */
++ for ( i = 0 ; i < NCAPINTS ; i++ )
++ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
++ }
++
++ /* Init Machine Check Exception if available. */
++ mcheck_init(c);
++
++ if (c == &boot_cpu_data)
++ sysenter_setup();
++ enable_sep_cpu();
++
++ if (c == &boot_cpu_data)
++ mtrr_bp_init();
++ else
++ mtrr_ap_init();
++}
++
++#ifdef CONFIG_X86_HT
++void __cpuinit detect_ht(struct cpuinfo_x86 *c)
++{
++ u32 eax, ebx, ecx, edx;
++ int index_msb, core_bits;
++
++ cpuid(1, &eax, &ebx, &ecx, &edx);
++
++ if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
++ return;
++
++ smp_num_siblings = (ebx & 0xff0000) >> 16;
++
++ if (smp_num_siblings == 1) {
++ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
++ } else if (smp_num_siblings > 1 ) {
++
++ if (smp_num_siblings > NR_CPUS) {
++ printk(KERN_WARNING "CPU: Unsupported number of the "
++ "siblings %d", smp_num_siblings);
++ smp_num_siblings = 1;
++ return;
++ }
++
++ index_msb = get_count_order(smp_num_siblings);
++ c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
++
++ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
++ c->phys_proc_id);
++
++ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
++
++ index_msb = get_count_order(smp_num_siblings) ;
++
++ core_bits = get_count_order(c->x86_max_cores);
++
++ c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
++ ((1 << core_bits) - 1);
++
++ if (c->x86_max_cores > 1)
++ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
++ c->cpu_core_id);
++ }
++}
++#endif
++
++void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
++{
++ char *vendor = NULL;
++
++ if (c->x86_vendor < X86_VENDOR_NUM)
++ vendor = this_cpu->c_vendor;
++ else if (c->cpuid_level >= 0)
++ vendor = c->x86_vendor_id;
++
++ if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
++ printk("%s ", vendor);
++
++ if (!c->x86_model_id[0])
++ printk("%d86", c->x86);
++ else
++ printk("%s", c->x86_model_id);
++
++ if (c->x86_mask || c->cpuid_level >= 0)
++ printk(" stepping %02x\n", c->x86_mask);
++ else
++ printk("\n");
++}
++
++cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
++
++/* This is hacky. :)
++ * We're emulating future behavior.
++ * In the future, the cpu-specific init functions will be called implicitly
++ * via the magic of initcalls.
++ * They will insert themselves into the cpu_devs structure.
++ * Then, when cpu_init() is called, we can just iterate over that array.
++ */
++
++extern int intel_cpu_init(void);
++extern int cyrix_init_cpu(void);
++extern int nsc_init_cpu(void);
++extern int amd_init_cpu(void);
++extern int centaur_init_cpu(void);
++extern int transmeta_init_cpu(void);
++extern int rise_init_cpu(void);
++extern int nexgen_init_cpu(void);
++extern int umc_init_cpu(void);
++
++void __init early_cpu_init(void)
++{
++ intel_cpu_init();
++ cyrix_init_cpu();
++ nsc_init_cpu();
++ amd_init_cpu();
++ centaur_init_cpu();
++ transmeta_init_cpu();
++ rise_init_cpu();
++ nexgen_init_cpu();
++ umc_init_cpu();
++ early_cpu_detect();
++
++#ifdef CONFIG_DEBUG_PAGEALLOC
++ /* pse is not compatible with on-the-fly unmapping,
++ * disable it even if the cpus claim to support it.
++ */
++ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
++ disable_pse = 1;
++#endif
++}
++
++void __cpuinit cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
++{
++ unsigned long frames[16];
++ unsigned long va;
++ int f;
++
++ for (va = gdt_descr->address, f = 0;
++ va < gdt_descr->address + gdt_descr->size;
++ va += PAGE_SIZE, f++) {
++ frames[f] = virt_to_mfn(va);
++ make_lowmem_page_readonly(
++ (void *)va, XENFEAT_writable_descriptor_tables);
++ }
++ if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
++ BUG();
++}
++
++/*
++ * cpu_init() initializes state that is per-CPU. Some data is already
++ * initialized (naturally) in the bootstrap process, such as the GDT
++ * and IDT. We reload them nevertheless, this function acts as a
++ * 'CPU state barrier', nothing should get across.
++ */
++void __cpuinit cpu_init(void)
++{
++ int cpu = smp_processor_id();
++#ifndef CONFIG_X86_NO_TSS
++ struct tss_struct * t = &per_cpu(init_tss, cpu);
++#endif
++ struct thread_struct *thread = &current->thread;
++ struct desc_struct *gdt;
++ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
++
++ if (cpu_test_and_set(cpu, cpu_initialized)) {
++ printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
++ for (;;) local_irq_enable();
++ }
++ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
++
++ if (cpu_has_vme || cpu_has_de)
++ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
++ if (tsc_disable && cpu_has_tsc) {
++ printk(KERN_NOTICE "Disabling TSC...\n");
++ /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
++ clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
++ set_in_cr4(X86_CR4_TSD);
++ }
++
++#ifndef CONFIG_XEN
++ /* The CPU hotplug case */
++ if (cpu_gdt_descr->address) {
++ gdt = (struct desc_struct *)cpu_gdt_descr->address;
++ memset(gdt, 0, PAGE_SIZE);
++ goto old_gdt;
++ }
++ /*
++ * This is a horrible hack to allocate the GDT. The problem
++ * is that cpu_init() is called really early for the boot CPU
++ * (and hence needs bootmem) but much later for the secondary
++ * CPUs, when bootmem will have gone away
++ */
++ if (NODE_DATA(0)->bdata->node_bootmem_map) {
++ gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
++ /* alloc_bootmem_pages panics on failure, so no check */
++ memset(gdt, 0, PAGE_SIZE);
++ } else {
++ gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
++ if (unlikely(!gdt)) {
++ printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
++ for (;;)
++ local_irq_enable();
++ }
++ }
++old_gdt:
++ /*
++ * Initialize the per-CPU GDT with the boot GDT,
++ * and set up the GDT descriptor:
++ */
++ memcpy(gdt, cpu_gdt_table, GDT_SIZE);
++
++ /* Set up GDT entry for 16bit stack */
++ *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
++ ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
++ ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
++ (CPU_16BIT_STACK_SIZE - 1);
++
++ cpu_gdt_descr->size = GDT_SIZE - 1;
++ cpu_gdt_descr->address = (unsigned long)gdt;
++#else
++ if (cpu == 0 && cpu_gdt_descr->address == 0) {
++ gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
++ /* alloc_bootmem_pages panics on failure, so no check */
++ memset(gdt, 0, PAGE_SIZE);
++
++ memcpy(gdt, cpu_gdt_table, GDT_SIZE);
++
++ cpu_gdt_descr->size = GDT_SIZE;
++ cpu_gdt_descr->address = (unsigned long)gdt;
++ }
++#endif
++
++ cpu_gdt_init(cpu_gdt_descr);
++
++ /*
++ * Set up and load the per-CPU TSS and LDT
++ */
++ atomic_inc(&init_mm.mm_count);
++ current->active_mm = &init_mm;
++ if (current->mm)
++ BUG();
++ enter_lazy_tlb(&init_mm, current);
++
++ load_esp0(t, thread);
++
++ load_LDT(&init_mm.context);
++
++#ifdef CONFIG_DOUBLEFAULT
++ /* Set up doublefault TSS pointer in the GDT */
++ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
++#endif
++
++ /* Clear %fs and %gs. */
++ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
++
++ /* Clear all 6 debug registers: */
++ set_debugreg(0, 0);
++ set_debugreg(0, 1);
++ set_debugreg(0, 2);
++ set_debugreg(0, 3);
++ set_debugreg(0, 6);
++ set_debugreg(0, 7);
++
++ /*
++ * Force FPU initialization:
++ */
++ current_thread_info()->status = 0;
++ clear_used_math();
++ mxcsr_feature_mask_init();
++}
++
++#ifdef CONFIG_HOTPLUG_CPU
++void __cpuinit cpu_uninit(void)
++{
++ int cpu = raw_smp_processor_id();
++ cpu_clear(cpu, cpu_initialized);
++
++ /* lazy TLB state */
++ per_cpu(cpu_tlbstate, cpu).state = 0;
++ per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
++}
++#endif
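[Editorial aside, not part of the patch: the vendor-string layout and family/model decoding used by early_cpu_detect() and generic_identify() above can be sanity-checked from user space. A minimal sketch, assuming GCC's <cpuid.h> on an x86 host; it mirrors the same bit-field arithmetic, including the extended family/model adjustments.]

    #include <stdio.h>
    #include <string.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;
            unsigned int family, model;
            char vendor[13];

            /* Leaf 0: max standard leaf in EAX, vendor string in EBX,EDX,ECX */
            if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
                    return 1;               /* CPUID not supported */
            memcpy(vendor + 0, &ebx, 4);
            memcpy(vendor + 4, &edx, 4);
            memcpy(vendor + 8, &ecx, 4);
            vendor[12] = '\0';              /* e.g. "GenuineIntel" */

            /* Leaf 1: family/model/stepping, decoded as in the code above */
            __get_cpuid(1, &eax, &ebx, &ecx, &edx);
            family = (eax >> 8) & 15;
            model  = (eax >> 4) & 15;
            if (family == 0xf)
                    family += (eax >> 20) & 0xff;   /* extended family */
            if (family >= 0x6)
                    model += ((eax >> 16) & 0xf) << 4; /* extended model */

            printf("%s family %u model %u stepping %u\n",
                   vendor, family, model, eax & 15);
            return 0;
    }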
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/cpu/mtrr/main-xen.c 2007-08-27 14:02:01.000000000 -0400
+@@ -0,0 +1,197 @@
++#include <linux/init.h>
++#include <linux/proc_fs.h>
++#include <linux/ctype.h>
++#include <linux/module.h>
++#include <linux/seq_file.h>
++#include <asm/uaccess.h>
++#include <linux/mutex.h>
++
++#include <asm/mtrr.h>
++#include "mtrr.h"
++
++static DEFINE_MUTEX(mtrr_mutex);
++
++void generic_get_mtrr(unsigned int reg, unsigned long *base,
++ unsigned int *size, mtrr_type * type)
++{
++ struct xen_platform_op op;
++
++ op.cmd = XENPF_read_memtype;
++ op.u.read_memtype.reg = reg;
++ (void)HYPERVISOR_platform_op(&op);
++
++ *size = op.u.read_memtype.nr_mfns;
++ *base = op.u.read_memtype.mfn;
++ *type = op.u.read_memtype.type;
++}
++
++struct mtrr_ops generic_mtrr_ops = {
++ .use_intel_if = 1,
++ .get = generic_get_mtrr,
++};
++
++struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
++unsigned int num_var_ranges;
++unsigned int *usage_table;
++
++static void __init set_num_var_ranges(void)
++{
++ struct xen_platform_op op;
++
++ for (num_var_ranges = 0; ; num_var_ranges++) {
++ op.cmd = XENPF_read_memtype;
++ op.u.read_memtype.reg = num_var_ranges;
++ if (HYPERVISOR_platform_op(&op) != 0)
++ break;
++ }
++}
++
++static void __init init_table(void)
++{
++ int i, max;
++
++ max = num_var_ranges;
++ if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
++ == NULL) {
++ printk(KERN_ERR "mtrr: could not allocate\n");
++ return;
++ }
++ for (i = 0; i < max; i++)
++ usage_table[i] = 0;
++}
++
++int mtrr_add_page(unsigned long base, unsigned long size,
++ unsigned int type, char increment)
++{
++ int error;
++ struct xen_platform_op op;
++
++ mutex_lock(&mtrr_mutex);
++
++ op.cmd = XENPF_add_memtype;
++ op.u.add_memtype.mfn = base;
++ op.u.add_memtype.nr_mfns = size;
++ op.u.add_memtype.type = type;
++ error = HYPERVISOR_platform_op(&op);
++ if (error) {
++ mutex_unlock(&mtrr_mutex);
++ BUG_ON(error > 0);
++ return error;
++ }
++
++ if (increment)
++ ++usage_table[op.u.add_memtype.reg];
++
++ mutex_unlock(&mtrr_mutex);
++
++ return op.u.add_memtype.reg;
++}
++
++static int mtrr_check(unsigned long base, unsigned long size)
++{
++ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
++ printk(KERN_WARNING
++ "mtrr: size and base must be multiples of 4 kiB\n");
++ printk(KERN_DEBUG
++ "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
++ dump_stack();
++ return -1;
++ }
++ return 0;
++}
++
++int
++mtrr_add(unsigned long base, unsigned long size, unsigned int type,
++ char increment)
++{
++ if (mtrr_check(base, size))
++ return -EINVAL;
++ return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
++ increment);
++}
++
++int mtrr_del_page(int reg, unsigned long base, unsigned long size)
++{
++ unsigned i;
++ mtrr_type ltype;
++ unsigned long lbase;
++ unsigned int lsize;
++ int error = -EINVAL;
++ struct xen_platform_op op;
++
++ mutex_lock(&mtrr_mutex);
++
++ if (reg < 0) {
++ /* Search for existing MTRR */
++ for (i = 0; i < num_var_ranges; ++i) {
++ mtrr_if->get(i, &lbase, &lsize, &ltype);
++ if (lbase == base && lsize == size) {
++ reg = i;
++ break;
++ }
++ }
++ if (reg < 0) {
++ printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
++ size);
++ goto out;
++ }
++ }
++ if (usage_table[reg] < 1) {
++ printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
++ goto out;
++ }
++ if (--usage_table[reg] < 1) {
++ op.cmd = XENPF_del_memtype;
++ op.u.del_memtype.handle = 0;
++ op.u.del_memtype.reg = reg;
++ error = HYPERVISOR_platform_op(&op);
++ if (error) {
++ BUG_ON(error > 0);
++ goto out;
++ }
++ }
++ error = reg;
++ out:
++ mutex_unlock(&mtrr_mutex);
++ return error;
++}
++
++int
++mtrr_del(int reg, unsigned long base, unsigned long size)
++{
++ if (mtrr_check(base, size))
++ return -EINVAL;
++ return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
++}
++
++EXPORT_SYMBOL(mtrr_add);
++EXPORT_SYMBOL(mtrr_del);
++
++void __init mtrr_bp_init(void)
++{
++}
++
++void mtrr_ap_init(void)
++{
++}
++
++static int __init mtrr_init(void)
++{
++ struct cpuinfo_x86 *c = &boot_cpu_data;
++
++ if (!is_initial_xendomain())
++ return -ENODEV;
++
++ if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
++ (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
++ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
++ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
++ return -ENODEV;
++
++ set_num_var_ranges();
++ init_table();
++
++ return 0;
++}
++
++subsys_initcall(mtrr_init);
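[Editorial aside, not part of the patch: mtrr_add()/mtrr_del() above accept byte-granular base and size, reject anything not 4 KiB-aligned, and pass page-frame numbers to the XENPF_add_memtype/XENPF_del_memtype hypercalls. A stand-alone sketch of that validation and conversion, with PAGE_SHIFT hard-coded to 12 and purely illustrative example values:]

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* Same test as mtrr_check() above: both values must be 4 KiB multiples. */
    static int aligned_4k(unsigned long base, unsigned long size)
    {
            return !((base | size) & (PAGE_SIZE - 1));
    }

    int main(void)
    {
            unsigned long base = 0xd0000000UL, size = 0x02000000UL; /* example */

            if (!aligned_4k(base, size)) {
                    fprintf(stderr, "size and base must be multiples of 4 KiB\n");
                    return 1;
            }
            /* The frame-number arithmetic the hypercall arguments would see. */
            printf("mfn=0x%lx nr_mfns=0x%lx\n",
                   base >> PAGE_SHIFT, size >> PAGE_SHIFT);
            return 0;
    }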
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/early_printk-xen.c 2007-08-27 14:01:24.000000000 -0400
+@@ -0,0 +1,2 @@
++
++#include "../../x86_64/kernel/early_printk-xen.c"
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/entry-xen.S 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,1216 @@
++/*
++ * linux/arch/i386/entry.S
++ *
++ * Copyright (C) 1991, 1992 Linus Torvalds
++ */
++
++/*
++ * entry.S contains the system-call and fault low-level handling routines.
++ * This also contains the timer-interrupt handler, as well as all interrupts
++ * and faults that can result in a task-switch.
++ *
++ * NOTE: This code handles signal-recognition, which happens every time
++ * after a timer-interrupt and after each system call.
++ *
++ * I changed all the .align's to 4 (16 byte alignment), as that's faster
++ * on a 486.
++ *
++ * Stack layout in 'ret_from_system_call':
++ * ptrace needs to have all regs on the stack.
++ * if the order here is changed, it needs to be
++ * updated in fork.c:copy_process, signal.c:do_signal,
++ * ptrace.c and ptrace.h
++ *
++ * 0(%esp) - %ebx
++ * 4(%esp) - %ecx
++ * 8(%esp) - %edx
++ * C(%esp) - %esi
++ * 10(%esp) - %edi
++ * 14(%esp) - %ebp
++ * 18(%esp) - %eax
++ * 1C(%esp) - %ds
++ * 20(%esp) - %es
++ * 24(%esp) - orig_eax
++ * 28(%esp) - %eip
++ * 2C(%esp) - %cs
++ * 30(%esp) - %eflags
++ * 34(%esp) - %oldesp
++ * 38(%esp) - %oldss
++ *
++ * "current" is in register %ebx during any slow entries.
++ */
++
++#include <linux/linkage.h>
++#include <asm/thread_info.h>
++#include <asm/irqflags.h>
++#include <asm/errno.h>
++#include <asm/segment.h>
++#include <asm/smp.h>
++#include <asm/page.h>
++#include <asm/desc.h>
++#include <asm/dwarf2.h>
++#include "irq_vectors.h"
++#include <xen/interface/xen.h>
++
++#define nr_syscalls ((syscall_table_size)/4)
++
++EBX = 0x00
++ECX = 0x04
++EDX = 0x08
++ESI = 0x0C
++EDI = 0x10
++EBP = 0x14
++EAX = 0x18
++DS = 0x1C
++ES = 0x20
++ORIG_EAX = 0x24
++EIP = 0x28
++CS = 0x2C
++EFLAGS = 0x30
++OLDESP = 0x34
++OLDSS = 0x38
++
++CF_MASK = 0x00000001
++TF_MASK = 0x00000100
++IF_MASK = 0x00000200
++DF_MASK = 0x00000400
++NT_MASK = 0x00004000
++VM_MASK = 0x00020000
++/* Pseudo-eflags. */
++NMI_MASK = 0x80000000
++
++#ifndef CONFIG_XEN
++#define DISABLE_INTERRUPTS cli
++#define ENABLE_INTERRUPTS sti
++#else
++/* Offsets into shared_info_t. */
++#define evtchn_upcall_pending /* 0 */
++#define evtchn_upcall_mask 1
++
++#define sizeof_vcpu_shift 6
++
++#ifdef CONFIG_SMP
++#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
++ shl $sizeof_vcpu_shift,%esi ; \
++ addl HYPERVISOR_shared_info,%esi
++#else
++#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
++#endif
++
++#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
++#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
++#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
++ __DISABLE_INTERRUPTS
++#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
++ __ENABLE_INTERRUPTS
++#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
++#endif
++
++#ifdef CONFIG_PREEMPT
++#define preempt_stop cli; TRACE_IRQS_OFF
++#else
++#define preempt_stop
++#define resume_kernel restore_nocheck
++#endif
++
++.macro TRACE_IRQS_IRET
++#ifdef CONFIG_TRACE_IRQFLAGS
++ testl $IF_MASK,EFLAGS(%esp) # interrupts off?
++ jz 1f
++ TRACE_IRQS_ON
++1:
++#endif
++.endm
++
++#ifdef CONFIG_VM86
++#define resume_userspace_sig check_userspace
++#else
++#define resume_userspace_sig resume_userspace
++#endif
++
++#define SAVE_ALL \
++ cld; \
++ pushl %es; \
++ CFI_ADJUST_CFA_OFFSET 4;\
++ /*CFI_REL_OFFSET es, 0;*/\
++ pushl %ds; \
++ CFI_ADJUST_CFA_OFFSET 4;\
++ /*CFI_REL_OFFSET ds, 0;*/\
++ pushl %eax; \
++ CFI_ADJUST_CFA_OFFSET 4;\
++ CFI_REL_OFFSET eax, 0;\
++ pushl %ebp; \
++ CFI_ADJUST_CFA_OFFSET 4;\
++ CFI_REL_OFFSET ebp, 0;\
++ pushl %edi; \
++ CFI_ADJUST_CFA_OFFSET 4;\
++ CFI_REL_OFFSET edi, 0;\
++ pushl %esi; \
++ CFI_ADJUST_CFA_OFFSET 4;\
++ CFI_REL_OFFSET esi, 0;\
++ pushl %edx; \
++ CFI_ADJUST_CFA_OFFSET 4;\
++ CFI_REL_OFFSET edx, 0;\
++ pushl %ecx; \
++ CFI_ADJUST_CFA_OFFSET 4;\
++ CFI_REL_OFFSET ecx, 0;\
++ pushl %ebx; \
++ CFI_ADJUST_CFA_OFFSET 4;\
++ CFI_REL_OFFSET ebx, 0;\
++ movl $(__USER_DS), %edx; \
++ movl %edx, %ds; \
++ movl %edx, %es;
++
++#define RESTORE_INT_REGS \
++ popl %ebx; \
++ CFI_ADJUST_CFA_OFFSET -4;\
++ CFI_RESTORE ebx;\
++ popl %ecx; \
++ CFI_ADJUST_CFA_OFFSET -4;\
++ CFI_RESTORE ecx;\
++ popl %edx; \
++ CFI_ADJUST_CFA_OFFSET -4;\
++ CFI_RESTORE edx;\
++ popl %esi; \
++ CFI_ADJUST_CFA_OFFSET -4;\
++ CFI_RESTORE esi;\
++ popl %edi; \
++ CFI_ADJUST_CFA_OFFSET -4;\
++ CFI_RESTORE edi;\
++ popl %ebp; \
++ CFI_ADJUST_CFA_OFFSET -4;\
++ CFI_RESTORE ebp;\
++ popl %eax; \
++ CFI_ADJUST_CFA_OFFSET -4;\
++ CFI_RESTORE eax
++
++#define RESTORE_REGS \
++ RESTORE_INT_REGS; \
++1: popl %ds; \
++ CFI_ADJUST_CFA_OFFSET -4;\
++ /*CFI_RESTORE ds;*/\
++2: popl %es; \
++ CFI_ADJUST_CFA_OFFSET -4;\
++ /*CFI_RESTORE es;*/\
++.section .fixup,"ax"; \
++3: movl $0,(%esp); \
++ jmp 1b; \
++4: movl $0,(%esp); \
++ jmp 2b; \
++.previous; \
++.section __ex_table,"a";\
++ .align 4; \
++ .long 1b,3b; \
++ .long 2b,4b; \
++.previous
++
++#define RING0_INT_FRAME \
++ CFI_STARTPROC simple;\
++ CFI_DEF_CFA esp, 3*4;\
++ /*CFI_OFFSET cs, -2*4;*/\
++ CFI_OFFSET eip, -3*4
++
++#define RING0_EC_FRAME \
++ CFI_STARTPROC simple;\
++ CFI_DEF_CFA esp, 4*4;\
++ /*CFI_OFFSET cs, -2*4;*/\
++ CFI_OFFSET eip, -3*4
++
++#define RING0_PTREGS_FRAME \
++ CFI_STARTPROC simple;\
++ CFI_DEF_CFA esp, OLDESP-EBX;\
++ /*CFI_OFFSET cs, CS-OLDESP;*/\
++ CFI_OFFSET eip, EIP-OLDESP;\
++ /*CFI_OFFSET es, ES-OLDESP;*/\
++ /*CFI_OFFSET ds, DS-OLDESP;*/\
++ CFI_OFFSET eax, EAX-OLDESP;\
++ CFI_OFFSET ebp, EBP-OLDESP;\
++ CFI_OFFSET edi, EDI-OLDESP;\
++ CFI_OFFSET esi, ESI-OLDESP;\
++ CFI_OFFSET edx, EDX-OLDESP;\
++ CFI_OFFSET ecx, ECX-OLDESP;\
++ CFI_OFFSET ebx, EBX-OLDESP
++
++ENTRY(ret_from_fork)
++ CFI_STARTPROC
++ pushl %eax
++ CFI_ADJUST_CFA_OFFSET 4
++ call schedule_tail
++ GET_THREAD_INFO(%ebp)
++ popl %eax
++ CFI_ADJUST_CFA_OFFSET -4
++ pushl $0x0202 # Reset kernel eflags
++ CFI_ADJUST_CFA_OFFSET 4
++ popfl
++ CFI_ADJUST_CFA_OFFSET -4
++ jmp syscall_exit
++ CFI_ENDPROC
++
++/*
++ * Return to user mode is not as complex as all this looks,
++ * but we want the default path for a system call return to
++ * go as quickly as possible which is why some of this is
++ * less clear than it otherwise should be.
++ */
++
++ # userspace resumption stub bypassing syscall exit tracing
++ ALIGN
++ RING0_PTREGS_FRAME
++ret_from_exception:
++ preempt_stop
++ret_from_intr:
++ GET_THREAD_INFO(%ebp)
++check_userspace:
++ movl EFLAGS(%esp), %eax # mix EFLAGS and CS
++ movb CS(%esp), %al
++ testl $(VM_MASK | 2), %eax
++ jz resume_kernel
++ENTRY(resume_userspace)
++ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
++ # setting need_resched or sigpending
++ # between sampling and the iret
++ movl TI_flags(%ebp), %ecx
++ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
++ # int/exception return?
++ jne work_pending
++ jmp restore_all
++
++#ifdef CONFIG_PREEMPT
++ENTRY(resume_kernel)
++ cli
++ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
++ jnz restore_nocheck
++need_resched:
++ movl TI_flags(%ebp), %ecx # need_resched set ?
++ testb $_TIF_NEED_RESCHED, %cl
++ jz restore_all
++ testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
++ jz restore_all
++ call preempt_schedule_irq
++ jmp need_resched
++#endif
++ CFI_ENDPROC
++
++/* SYSENTER_RETURN points to after the "sysenter" instruction in
++ the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
++
++ # sysenter call handler stub
++ENTRY(sysenter_entry)
++ CFI_STARTPROC simple
++ CFI_DEF_CFA esp, 0
++ CFI_REGISTER esp, ebp
++ movl SYSENTER_stack_esp0(%esp),%esp
++sysenter_past_esp:
++ /*
++ * No need to follow this irqs on/off section: the syscall
++ * disabled irqs and here we enable it straight after entry:
++ */
++ sti
++ pushl $(__USER_DS)
++ CFI_ADJUST_CFA_OFFSET 4
++ /*CFI_REL_OFFSET ss, 0*/
++ pushl %ebp
++ CFI_ADJUST_CFA_OFFSET 4
++ CFI_REL_OFFSET esp, 0
++ pushfl
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl $(__USER_CS)
++ CFI_ADJUST_CFA_OFFSET 4
++ /*CFI_REL_OFFSET cs, 0*/
++ /*
++ * Push current_thread_info()->sysenter_return to the stack.
++ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
++ * pushed above; +8 corresponds to copy_thread's esp0 setting.
++ */
++ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
++ CFI_ADJUST_CFA_OFFSET 4
++ CFI_REL_OFFSET eip, 0
++
++/*
++ * Load the potential sixth argument from user stack.
++ * Careful about security.
++ */
++ cmpl $__PAGE_OFFSET-3,%ebp
++ jae syscall_fault
++1: movl (%ebp),%ebp
++.section __ex_table,"a"
++ .align 4
++ .long 1b,syscall_fault
++.previous
++
++ pushl %eax
++ CFI_ADJUST_CFA_OFFSET 4
++ SAVE_ALL
++ GET_THREAD_INFO(%ebp)
++
++ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
++ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
++ jnz syscall_trace_entry
++ cmpl $(nr_syscalls), %eax
++ jae syscall_badsys
++ call *sys_call_table(,%eax,4)
++ movl %eax,EAX(%esp)
++ DISABLE_INTERRUPTS
++ TRACE_IRQS_OFF
++ movl TI_flags(%ebp), %ecx
++ testw $_TIF_ALLWORK_MASK, %cx
++ jne syscall_exit_work
++/* if something modifies registers it must also disable sysexit */
++ movl EIP(%esp), %edx
++ movl OLDESP(%esp), %ecx
++ xorl %ebp,%ebp
++#ifdef CONFIG_XEN
++ TRACE_IRQS_ON
++ __ENABLE_INTERRUPTS
++sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
++ __TEST_PENDING
++ jnz 14f # process more events if necessary...
++ movl ESI(%esp), %esi
++ sysexit
++14: __DISABLE_INTERRUPTS
++ TRACE_IRQS_OFF
++sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
++ push %esp
++ call evtchn_do_upcall
++ add $4,%esp
++ jmp ret_from_intr
++#else
++ TRACE_IRQS_ON
++ sti
++ sysexit
++#endif /* !CONFIG_XEN */
++ CFI_ENDPROC
++
++
++ # system call handler stub
++ENTRY(system_call)
++ RING0_INT_FRAME # can't unwind into user space anyway
++ pushl %eax # save orig_eax
++ CFI_ADJUST_CFA_OFFSET 4
++ SAVE_ALL
++ GET_THREAD_INFO(%ebp)
++ testl $TF_MASK,EFLAGS(%esp)
++ jz no_singlestep
++ orl $_TIF_SINGLESTEP,TI_flags(%ebp)
++no_singlestep:
++ # system call tracing in operation / emulation
++ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
++ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
++ jnz syscall_trace_entry
++ cmpl $(nr_syscalls), %eax
++ jae syscall_badsys
++syscall_call:
++ call *sys_call_table(,%eax,4)
++ movl %eax,EAX(%esp) # store the return value
++syscall_exit:
++ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
++ # setting need_resched or sigpending
++ # between sampling and the iret
++ TRACE_IRQS_OFF
++ movl TI_flags(%ebp), %ecx
++ testw $_TIF_ALLWORK_MASK, %cx # current->work
++ jne syscall_exit_work
++
++restore_all:
++#ifndef CONFIG_XEN
++ movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
++ # Warning: OLDSS(%esp) contains the wrong/random values if we
++ # are returning to the kernel.
++ # See comments in process.c:copy_thread() for details.
++ movb OLDSS(%esp), %ah
++ movb CS(%esp), %al
++ andl $(VM_MASK | (4 << 8) | 3), %eax
++ cmpl $((4 << 8) | 3), %eax
++ CFI_REMEMBER_STATE
++ je ldt_ss # returning to user-space with LDT SS
++restore_nocheck:
++#else
++restore_nocheck:
++ movl EFLAGS(%esp), %eax
++ testl $(VM_MASK|NMI_MASK), %eax
++ CFI_REMEMBER_STATE
++ jnz hypervisor_iret
++ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
++ GET_VCPU_INFO
++ andb evtchn_upcall_mask(%esi),%al
++ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
++ CFI_REMEMBER_STATE
++ jnz restore_all_enable_events # != 0 => enable event delivery
++#endif
++ TRACE_IRQS_IRET
++restore_nocheck_notrace:
++ RESTORE_REGS
++ addl $4, %esp
++ CFI_ADJUST_CFA_OFFSET -4
++1: iret
++.section .fixup,"ax"
++iret_exc:
++#ifndef CONFIG_XEN
++ TRACE_IRQS_ON
++ sti
++#endif
++ pushl $0 # no error code
++ pushl $do_iret_error
++ jmp error_code
++.previous
++.section __ex_table,"a"
++ .align 4
++ .long 1b,iret_exc
++.previous
++
++ CFI_RESTORE_STATE
++#ifndef CONFIG_XEN
++ldt_ss:
++ larl OLDSS(%esp), %eax
++ jnz restore_nocheck
++ testl $0x00400000, %eax # returning to 32bit stack?
++ jnz restore_nocheck # all right, normal return
++ /* If returning to userspace with 16bit stack,
++ * try to fix the higher word of ESP, as the CPU
++ * won't restore it.
++ * This is an "official" bug of all the x86-compatible
++ * CPUs, which we can try to work around to make
++ * dosemu and wine happy. */
++ subl $8, %esp # reserve space for switch16 pointer
++ CFI_ADJUST_CFA_OFFSET 8
++ cli
++ TRACE_IRQS_OFF
++ movl %esp, %eax
++ /* Set up the 16bit stack frame with switch32 pointer on top,
++ * and a switch16 pointer on top of the current frame. */
++ call setup_x86_bogus_stack
++ CFI_ADJUST_CFA_OFFSET -8 # frame has moved
++ TRACE_IRQS_IRET
++ RESTORE_REGS
++ lss 20+4(%esp), %esp # switch to 16bit stack
++1: iret
++.section __ex_table,"a"
++ .align 4
++ .long 1b,iret_exc
++.previous
++#else
++ ALIGN
++restore_all_enable_events:
++ TRACE_IRQS_ON
++ __ENABLE_INTERRUPTS
++scrit: /**** START OF CRITICAL REGION ****/
++ __TEST_PENDING
++ jnz 14f # process more events if necessary...
++ RESTORE_REGS
++ addl $4, %esp
++ CFI_ADJUST_CFA_OFFSET -4
++1: iret
++.section __ex_table,"a"
++ .align 4
++ .long 1b,iret_exc
++.previous
++14: __DISABLE_INTERRUPTS
++ TRACE_IRQS_OFF
++ jmp 11f
++ecrit: /**** END OF CRITICAL REGION ****/
++
++ CFI_RESTORE_STATE
++hypervisor_iret:
++ andl $~NMI_MASK, EFLAGS(%esp)
++ RESTORE_REGS
++ addl $4, %esp
++ CFI_ADJUST_CFA_OFFSET -4
++ jmp hypercall_page + (__HYPERVISOR_iret * 32)
++#endif
++ CFI_ENDPROC
++
++ # perform work that needs to be done immediately before resumption
++ ALIGN
++ RING0_PTREGS_FRAME # can't unwind into user space anyway
++work_pending:
++ testb $_TIF_NEED_RESCHED, %cl
++ jz work_notifysig
++work_resched:
++ call schedule
++ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
++ # setting need_resched or sigpending
++ # between sampling and the iret
++ TRACE_IRQS_OFF
++ movl TI_flags(%ebp), %ecx
++ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
++ # than syscall tracing?
++ jz restore_all
++ testb $_TIF_NEED_RESCHED, %cl
++ jnz work_resched
++
++work_notifysig: # deal with pending signals and
++ # notify-resume requests
++ testl $VM_MASK, EFLAGS(%esp)
++ movl %esp, %eax
++ jne work_notifysig_v86 # returning to kernel-space or
++ # vm86-space
++ xorl %edx, %edx
++ call do_notify_resume
++ jmp resume_userspace_sig
++
++ ALIGN
++work_notifysig_v86:
++#ifdef CONFIG_VM86
++ pushl %ecx # save ti_flags for do_notify_resume
++ CFI_ADJUST_CFA_OFFSET 4
++ call save_v86_state # %eax contains pt_regs pointer
++ popl %ecx
++ CFI_ADJUST_CFA_OFFSET -4
++ movl %eax, %esp
++ xorl %edx, %edx
++ call do_notify_resume
++ jmp resume_userspace_sig
++#endif
++
++ # perform syscall exit tracing
++ ALIGN
++syscall_trace_entry:
++ movl $-ENOSYS,EAX(%esp)
++ movl %esp, %eax
++ xorl %edx,%edx
++ call do_syscall_trace
++ cmpl $0, %eax
++ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
++ # so must skip actual syscall
++ movl ORIG_EAX(%esp), %eax
++ cmpl $(nr_syscalls), %eax
++ jnae syscall_call
++ jmp syscall_exit
++
++ # perform syscall exit tracing
++ ALIGN
++syscall_exit_work:
++ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
++ jz work_pending
++ TRACE_IRQS_ON
++ ENABLE_INTERRUPTS # could let do_syscall_trace() call
++ # schedule() instead
++ movl %esp, %eax
++ movl $1, %edx
++ call do_syscall_trace
++ jmp resume_userspace
++ CFI_ENDPROC
++
++ RING0_INT_FRAME # can't unwind into user space anyway
++syscall_fault:
++ pushl %eax # save orig_eax
++ CFI_ADJUST_CFA_OFFSET 4
++ SAVE_ALL
++ GET_THREAD_INFO(%ebp)
++ movl $-EFAULT,EAX(%esp)
++ jmp resume_userspace
++
++syscall_badsys:
++ movl $-ENOSYS,EAX(%esp)
++ jmp resume_userspace
++ CFI_ENDPROC
++
++#ifndef CONFIG_XEN
++#define FIXUP_ESPFIX_STACK \
++ movl %esp, %eax; \
++ /* switch to 32bit stack using the pointer on top of 16bit stack */ \
++ lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
++ /* copy data from 16bit stack to 32bit stack */ \
++ call fixup_x86_bogus_stack; \
++ /* put ESP to the proper location */ \
++ movl %eax, %esp;
++#define UNWIND_ESPFIX_STACK \
++ pushl %eax; \
++ CFI_ADJUST_CFA_OFFSET 4; \
++ movl %ss, %eax; \
++ /* see if on 16bit stack */ \
++ cmpw $__ESPFIX_SS, %ax; \
++ je 28f; \
++27: popl %eax; \
++ CFI_ADJUST_CFA_OFFSET -4; \
++.section .fixup,"ax"; \
++28: movl $__KERNEL_DS, %eax; \
++ movl %eax, %ds; \
++ movl %eax, %es; \
++ /* switch to 32bit stack */ \
++ FIXUP_ESPFIX_STACK; \
++ jmp 27b; \
++.previous
++
++/*
++ * Build the entry stubs and pointer table with
++ * some assembler magic.
++ */
++.data
++ENTRY(interrupt)
++.text
++
++vector=0
++ENTRY(irq_entries_start)
++ RING0_INT_FRAME
++.rept NR_IRQS
++ ALIGN
++ .if vector
++ CFI_ADJUST_CFA_OFFSET -4
++ .endif
++1: pushl $~(vector)
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp common_interrupt
++.data
++ .long 1b
++.text
++vector=vector+1
++.endr
++
++/*
++ * the CPU automatically disables interrupts when executing an IRQ vector,
++ * so IRQ-flags tracing has to follow that:
++ */
++ ALIGN
++common_interrupt:
++ SAVE_ALL
++ TRACE_IRQS_OFF
++ movl %esp,%eax
++ call do_IRQ
++ jmp ret_from_intr
++ CFI_ENDPROC
++
++#define BUILD_INTERRUPT(name, nr) \
++ENTRY(name) \
++ RING0_INT_FRAME; \
++ pushl $~(nr); \
++ CFI_ADJUST_CFA_OFFSET 4; \
++ SAVE_ALL; \
++ TRACE_IRQS_OFF \
++ movl %esp,%eax; \
++ call smp_/**/name; \
++ jmp ret_from_intr; \
++ CFI_ENDPROC
++
++/* The include is where all of the SMP etc. interrupts come from */
++#include "entry_arch.h"
++#else
++#define UNWIND_ESPFIX_STACK
++#endif
++
++ENTRY(divide_error)
++ RING0_INT_FRAME
++ pushl $0 # no error code
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl $do_divide_error
++ CFI_ADJUST_CFA_OFFSET 4
++ ALIGN
++error_code:
++ pushl %ds
++ CFI_ADJUST_CFA_OFFSET 4
++ /*CFI_REL_OFFSET ds, 0*/
++ pushl %eax
++ CFI_ADJUST_CFA_OFFSET 4
++ CFI_REL_OFFSET eax, 0
++ xorl %eax, %eax
++ pushl %ebp
++ CFI_ADJUST_CFA_OFFSET 4
++ CFI_REL_OFFSET ebp, 0
++ pushl %edi
++ CFI_ADJUST_CFA_OFFSET 4
++ CFI_REL_OFFSET edi, 0
++ pushl %esi
++ CFI_ADJUST_CFA_OFFSET 4
++ CFI_REL_OFFSET esi, 0
++ pushl %edx
++ CFI_ADJUST_CFA_OFFSET 4
++ CFI_REL_OFFSET edx, 0
++ decl %eax # eax = -1
++ pushl %ecx
++ CFI_ADJUST_CFA_OFFSET 4
++ CFI_REL_OFFSET ecx, 0
++ pushl %ebx
++ CFI_ADJUST_CFA_OFFSET 4
++ CFI_REL_OFFSET ebx, 0
++ cld
++ pushl %es
++ CFI_ADJUST_CFA_OFFSET 4
++ /*CFI_REL_OFFSET es, 0*/
++ UNWIND_ESPFIX_STACK
++ popl %ecx
++ CFI_ADJUST_CFA_OFFSET -4
++ /*CFI_REGISTER es, ecx*/
++ movl ES(%esp), %edi # get the function address
++ movl ORIG_EAX(%esp), %edx # get the error code
++ movl %eax, ORIG_EAX(%esp)
++ movl %ecx, ES(%esp)
++ /*CFI_REL_OFFSET es, ES*/
++ movl $(__USER_DS), %ecx
++ movl %ecx, %ds
++ movl %ecx, %es
++ movl %esp,%eax # pt_regs pointer
++ call *%edi
++ jmp ret_from_exception
++ CFI_ENDPROC
++
++#ifdef CONFIG_XEN
++# A note on the "critical region" in our callback handler.
++# We want to avoid stacking callback handlers due to events occurring
++# during handling of the last event. To do this, we keep events disabled
++# until we've done all processing. HOWEVER, we must enable events before
++# popping the stack frame (can't be done atomically) and so it would still
++# be possible to get enough handler activations to overflow the stack.
++# Although unlikely, bugs of that kind are hard to track down, so we'd
++# like to avoid the possibility.
++# So, on entry to the handler we detect whether we interrupted an
++# existing activation in its critical region -- if so, we pop the current
++# activation and restart the handler using the previous one.
++#
++# The sysexit critical region is slightly different. sysexit
++# atomically removes the entire stack frame. If we interrupt in the
++# critical region we know that the entire frame is present and correct
++# so we can simply throw away the new one.
++ENTRY(hypervisor_callback)
++ RING0_INT_FRAME
++ pushl %eax
++ CFI_ADJUST_CFA_OFFSET 4
++ SAVE_ALL
++ movl EIP(%esp),%eax
++ cmpl $scrit,%eax
++ jb 11f
++ cmpl $ecrit,%eax
++ jb critical_region_fixup
++ cmpl $sysexit_scrit,%eax
++ jb 11f
++ cmpl $sysexit_ecrit,%eax
++ ja 11f
++ addl $OLDESP,%esp # Remove eflags...ebx from stack frame.
++11: push %esp
++ CFI_ADJUST_CFA_OFFSET 4
++ call evtchn_do_upcall
++ add $4,%esp
++ CFI_ADJUST_CFA_OFFSET -4
++ jmp ret_from_intr
++ CFI_ENDPROC
++
++# [How we do the fixup]. We want to merge the current stack frame with the
++# just-interrupted frame. How we do this depends on where in the critical
++# region the interrupted handler was executing, and so how many saved
++# registers are in each frame. We do this quickly using the lookup table
++# 'critical_fixup_table'. For each byte offset in the critical region, it
++# provides the number of bytes which have already been popped from the
++# interrupted stack frame.
++critical_region_fixup:
++ movzbl critical_fixup_table-scrit(%eax),%ecx # %ecx = num bytes already popped
++ cmpb $0xff,%cl # 0xff => vcpu_info critical region
++ jne 15f
++ xorl %ecx,%ecx
++15: leal (%esp,%ecx),%esi # %esi points at end of src region
++ leal OLDESP(%esp),%edi # %edi points at end of dst region
++ shrl $2,%ecx # convert bytes to words
++ je 17f # skip loop if nothing to copy
++16: subl $4,%esi # pre-decrementing copy loop
++ subl $4,%edi
++ movl (%esi),%eax
++ movl %eax,(%edi)
++ loop 16b
++17: movl %edi,%esp # final %edi is top of merged stack
++ jmp 11b
++
++.section .rodata,"a"
++critical_fixup_table:
++ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING
++ .byte 0xff,0xff # jnz 14f
++ .byte 0x00 # pop %ebx
++ .byte 0x04 # pop %ecx
++ .byte 0x08 # pop %edx
++ .byte 0x0c # pop %esi
++ .byte 0x10 # pop %edi
++ .byte 0x14 # pop %ebp
++ .byte 0x18 # pop %eax
++ .byte 0x1c # pop %ds
++ .byte 0x20 # pop %es
++ .byte 0x24,0x24,0x24 # add $4,%esp
++ .byte 0x28 # iret
++ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
++ .byte 0x00,0x00 # jmp 11b
++.previous
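[Editorial aside, not part of the patch: in C terms, the copy loop at critical_region_fixup amounts to the toy model below. It illustrates only the frame merge, not kernel code; bytes_popped is the value looked up in critical_fixup_table and frame_bytes corresponds to OLDESP.]

    #include <stdint.h>

    /* Toy model of critical_region_fixup above -- illustration only. */
    static uint32_t *merge_frames(uint32_t *esp, unsigned int bytes_popped,
                                  unsigned int frame_bytes /* OLDESP */)
    {
            uint32_t *src = esp + bytes_popped / 4;  /* end of source region */
            uint32_t *dst = esp + frame_bytes / 4;   /* end of dest region   */
            unsigned int words = bytes_popped / 4;

            while (words--)                          /* pre-decrementing copy */
                    *--dst = *--src;
            return dst;                              /* becomes the new %esp  */
    }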
++
++# Hypervisor uses this for application faults while it executes.
++# We get here for two reasons:
++# 1. Fault while reloading DS, ES, FS or GS
++# 2. Fault while executing IRET
++# Category 1 we fix up by reattempting the load, and zeroing the segment
++# register if the load fails.
++# Category 2 we fix up by jumping to do_iret_error. We cannot use the
++# normal Linux return path in this case because if we use the IRET hypercall
++# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
++# We distinguish between categories by maintaining a status value in EAX.
++ENTRY(failsafe_callback)
++ pushl %eax
++ movl $1,%eax
++1: mov 4(%esp),%ds
++2: mov 8(%esp),%es
++3: mov 12(%esp),%fs
++4: mov 16(%esp),%gs
++ testl %eax,%eax
++ popl %eax
++ jz 5f
++ addl $16,%esp # EAX != 0 => Category 2 (Bad IRET)
++ jmp iret_exc
++5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment)
++ RING0_INT_FRAME
++ pushl $0
++ SAVE_ALL
++ jmp ret_from_exception
++.section .fixup,"ax"; \
++6: xorl %eax,%eax; \
++ movl %eax,4(%esp); \
++ jmp 1b; \
++7: xorl %eax,%eax; \
++ movl %eax,8(%esp); \
++ jmp 2b; \
++8: xorl %eax,%eax; \
++ movl %eax,12(%esp); \
++ jmp 3b; \
++9: xorl %eax,%eax; \
++ movl %eax,16(%esp); \
++ jmp 4b; \
++.previous; \
++.section __ex_table,"a"; \
++ .align 4; \
++ .long 1b,6b; \
++ .long 2b,7b; \
++ .long 3b,8b; \
++ .long 4b,9b; \
++.previous
++#endif
++ CFI_ENDPROC
++
++ENTRY(coprocessor_error)
++ RING0_INT_FRAME
++ pushl $0
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl $do_coprocessor_error
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++ENTRY(simd_coprocessor_error)
++ RING0_INT_FRAME
++ pushl $0
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl $do_simd_coprocessor_error
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++ENTRY(device_not_available)
++ RING0_INT_FRAME
++ pushl $-1 # mark this as an int
++ CFI_ADJUST_CFA_OFFSET 4
++ SAVE_ALL
++#ifndef CONFIG_XEN
++ movl %cr0, %eax
++ testl $0x4, %eax # EM (math emulation bit)
++ je device_available_emulate
++ pushl $0 # temporary storage for ORIG_EIP
++ CFI_ADJUST_CFA_OFFSET 4
++ call math_emulate
++ addl $4, %esp
++ CFI_ADJUST_CFA_OFFSET -4
++ jmp ret_from_exception
++device_available_emulate:
++#endif
++ preempt_stop
++ call math_state_restore
++ jmp ret_from_exception
++ CFI_ENDPROC
++
++#ifndef CONFIG_XEN
++/*
++ * Debug traps and NMI can happen at the one SYSENTER instruction
++ * that sets up the real kernel stack. Check here, since we can't
++ * allow the wrong stack to be used.
++ *
++ * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
++ * already pushed 3 words if it hits on the sysenter instruction:
++ * eflags, cs and eip.
++ *
++ * We just load the right stack, and push the three (known) values
++ * by hand onto the new stack - while updating the return eip past
++ * the instruction that would have done it for sysenter.
++ */
++#define FIX_STACK(offset, ok, label) \
++ cmpw $__KERNEL_CS,4(%esp); \
++ jne ok; \
++label: \
++ movl SYSENTER_stack_esp0+offset(%esp),%esp; \
++ pushfl; \
++ pushl $__KERNEL_CS; \
++ pushl $sysenter_past_esp
++#endif /* CONFIG_XEN */
++
++KPROBE_ENTRY(debug)
++ RING0_INT_FRAME
++#ifndef CONFIG_XEN
++ cmpl $sysenter_entry,(%esp)
++ jne debug_stack_correct
++ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
++debug_stack_correct:
++#endif /* !CONFIG_XEN */
++ pushl $-1 # mark this as an int
++ CFI_ADJUST_CFA_OFFSET 4
++ SAVE_ALL
++ xorl %edx,%edx # error code 0
++ movl %esp,%eax # pt_regs pointer
++ call do_debug
++ jmp ret_from_exception
++ CFI_ENDPROC
++ .previous .text
++#ifndef CONFIG_XEN
++/*
++ * NMI is doubly nasty. It can happen _while_ we're handling
++ * a debug fault, and the debug fault hasn't yet been able to
++ * clear up the stack. So we first check whether we got an
++ * NMI on the sysenter entry path, but after that we need to
++ * check whether we got an NMI on the debug path where the debug
++ * fault happened on the sysenter path.
++ */
++ENTRY(nmi)
++ RING0_INT_FRAME
++ pushl %eax
++ CFI_ADJUST_CFA_OFFSET 4
++ movl %ss, %eax
++ cmpw $__ESPFIX_SS, %ax
++ popl %eax
++ CFI_ADJUST_CFA_OFFSET -4
++ je nmi_16bit_stack
++ cmpl $sysenter_entry,(%esp)
++ je nmi_stack_fixup
++ pushl %eax
++ CFI_ADJUST_CFA_OFFSET 4
++ movl %esp,%eax
++ /* Do not access memory above the end of our stack page,
++ * it might not exist.
++ */
++ andl $(THREAD_SIZE-1),%eax
++ cmpl $(THREAD_SIZE-20),%eax
++ popl %eax
++ CFI_ADJUST_CFA_OFFSET -4
++ jae nmi_stack_correct
++ cmpl $sysenter_entry,12(%esp)
++ je nmi_debug_stack_check
++nmi_stack_correct:
++ pushl %eax
++ CFI_ADJUST_CFA_OFFSET 4
++ SAVE_ALL
++ xorl %edx,%edx # zero error code
++ movl %esp,%eax # pt_regs pointer
++ call do_nmi
++ jmp restore_nocheck_notrace
++ CFI_ENDPROC
++
++nmi_stack_fixup:
++ FIX_STACK(12,nmi_stack_correct, 1)
++ jmp nmi_stack_correct
++nmi_debug_stack_check:
++ cmpw $__KERNEL_CS,16(%esp)
++ jne nmi_stack_correct
++ cmpl $debug,(%esp)
++ jb nmi_stack_correct
++ cmpl $debug_esp_fix_insn,(%esp)
++ ja nmi_stack_correct
++ FIX_STACK(24,nmi_stack_correct, 1)
++ jmp nmi_stack_correct
++
++nmi_16bit_stack:
++ RING0_INT_FRAME
++ /* create the pointer to lss back */
++ pushl %ss
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl %esp
++ CFI_ADJUST_CFA_OFFSET 4
++ movzwl %sp, %esp
++ addw $4, (%esp)
++ /* copy the iret frame of 12 bytes */
++ .rept 3
++ pushl 16(%esp)
++ CFI_ADJUST_CFA_OFFSET 4
++ .endr
++ pushl %eax
++ CFI_ADJUST_CFA_OFFSET 4
++ SAVE_ALL
++ FIXUP_ESPFIX_STACK # %eax == %esp
++ CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
++ xorl %edx,%edx # zero error code
++ call do_nmi
++ RESTORE_REGS
++ lss 12+4(%esp), %esp # back to 16bit stack
++1: iret
++ CFI_ENDPROC
++.section __ex_table,"a"
++ .align 4
++ .long 1b,iret_exc
++.previous
++#else
++ENTRY(nmi)
++ RING0_INT_FRAME
++ pushl %eax
++ CFI_ADJUST_CFA_OFFSET 4
++ SAVE_ALL
++ xorl %edx,%edx # zero error code
++ movl %esp,%eax # pt_regs pointer
++ call do_nmi
++ orl $NMI_MASK, EFLAGS(%esp)
++ jmp restore_all
++ CFI_ENDPROC
++#endif
++
++KPROBE_ENTRY(int3)
++ RING0_INT_FRAME
++ pushl $-1 # mark this as an int
++ CFI_ADJUST_CFA_OFFSET 4
++ SAVE_ALL
++ xorl %edx,%edx # zero error code
++ movl %esp,%eax # pt_regs pointer
++ call do_int3
++ jmp ret_from_exception
++ CFI_ENDPROC
++ .previous .text
++
++ENTRY(overflow)
++ RING0_INT_FRAME
++ pushl $0
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl $do_overflow
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++ENTRY(bounds)
++ RING0_INT_FRAME
++ pushl $0
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl $do_bounds
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++ENTRY(invalid_op)
++ RING0_INT_FRAME
++ pushl $0
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl $do_invalid_op
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++ENTRY(coprocessor_segment_overrun)
++ RING0_INT_FRAME
++ pushl $0
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl $do_coprocessor_segment_overrun
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++ENTRY(invalid_TSS)
++ RING0_EC_FRAME
++ pushl $do_invalid_TSS
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++ENTRY(segment_not_present)
++ RING0_EC_FRAME
++ pushl $do_segment_not_present
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++ENTRY(stack_segment)
++ RING0_EC_FRAME
++ pushl $do_stack_segment
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++KPROBE_ENTRY(general_protection)
++ RING0_EC_FRAME
++ pushl $do_general_protection
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++ .previous .text
++
++ENTRY(alignment_check)
++ RING0_EC_FRAME
++ pushl $do_alignment_check
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++KPROBE_ENTRY(page_fault)
++ RING0_EC_FRAME
++ pushl $do_page_fault
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++ .previous .text
++
++#ifdef CONFIG_X86_MCE
++ENTRY(machine_check)
++ RING0_INT_FRAME
++ pushl $0
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl machine_check_vector
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++#endif
++
++#ifndef CONFIG_XEN
++ENTRY(spurious_interrupt_bug)
++ RING0_INT_FRAME
++ pushl $0
++ CFI_ADJUST_CFA_OFFSET 4
++ pushl $do_spurious_interrupt_bug
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++#endif /* !CONFIG_XEN */
++
++#ifdef CONFIG_STACK_UNWIND
++ENTRY(arch_unwind_init_running)
++ CFI_STARTPROC
++ movl 4(%esp), %edx
++ movl (%esp), %ecx
++ leal 4(%esp), %eax
++ movl %ebx, EBX(%edx)
++ xorl %ebx, %ebx
++ movl %ebx, ECX(%edx)
++ movl %ebx, EDX(%edx)
++ movl %esi, ESI(%edx)
++ movl %edi, EDI(%edx)
++ movl %ebp, EBP(%edx)
++ movl %ebx, EAX(%edx)
++ movl $__USER_DS, DS(%edx)
++ movl $__USER_DS, ES(%edx)
++ movl %ebx, ORIG_EAX(%edx)
++ movl %ecx, EIP(%edx)
++ movl 12(%esp), %ecx
++ movl $__KERNEL_CS, CS(%edx)
++ movl %ebx, EFLAGS(%edx)
++ movl %eax, OLDESP(%edx)
++ movl 8(%esp), %eax
++ movl %ecx, 8(%esp)
++ movl EBX(%edx), %ebx
++ movl $__KERNEL_DS, OLDSS(%edx)
++ jmpl *%eax
++ CFI_ENDPROC
++ENDPROC(arch_unwind_init_running)
++#endif
++
++ENTRY(fixup_4gb_segment)
++ RING0_EC_FRAME
++ pushl $do_fixup_4gb_segment
++ CFI_ADJUST_CFA_OFFSET 4
++ jmp error_code
++ CFI_ENDPROC
++
++.section .rodata,"a"
++.align 4
++#include "syscall_table.S"
++
++syscall_table_size=(.-sys_call_table)
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/fixup.c 2007-08-27 14:01:24.000000000 -0400
+@@ -0,0 +1,88 @@
++/******************************************************************************
++ * fixup.c
++ *
++ * Binary-rewriting of certain IA32 instructions, on notification by Xen.
++ * Used to avoid repeated slow emulation of common instructions used by the
++ * user-space TLS (Thread-Local Storage) libraries.
++ *
++ * **** NOTE ****
++ * Issues with the binary rewriting have caused it to be removed. Instead
++ * we rely on Xen's emulator to boot the kernel, and then print a banner
++ * message recommending that the user disable /lib/tls.
++ *
++ * Copyright (c) 2004, K A Fraser
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++#include <linux/init.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/kernel.h>
++#include <linux/delay.h>
++#include <linux/version.h>
++
++#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
++
++fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
++{
++ static unsigned long printed = 0;
++ char info[100];
++ int i;
++
++ /* Ignore statically-linked init. */
++ if (current->tgid == 1)
++ return;
++
++ HYPERVISOR_vm_assist(
++ VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify);
++
++ if (test_and_set_bit(0, &printed))
++ return;
++
++ sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
++
++ DP("");
++ DP("***************************************************************");
++ DP("***************************************************************");
++ DP("** WARNING: Currently emulating unsupported memory accesses **");
++ DP("** in /lib/tls glibc libraries. The emulation is **");
++ DP("** slow. To ensure full performance you should **");
++ DP("** install a 'xen-friendly' (nosegneg) version of **");
++ DP("** the library, or disable tls support by executing **");
++ DP("** the following as root: **");
++ DP("** mv /lib/tls /lib/tls.disabled **");
++ DP("** Offending process: %-38.38s **", info);
++ DP("***************************************************************");
++ DP("***************************************************************");
++ DP("");
++
++ for (i = 5; i > 0; i--) {
++ touch_softlockup_watchdog();
++ printk("Pausing... %d", i);
++ mdelay(1000);
++ printk("\b\b\b\b\b\b\b\b\b\b\b\b");
++ }
++
++ printk("Continuing...\n\n");
++}
++
++static int __init fixup_init(void)
++{
++ HYPERVISOR_vm_assist(
++ VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify);
++ return 0;
++}
++__initcall(fixup_init);
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/head-xen.S 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,207 @@
++
++
++.text
++#include <linux/elfnote.h>
++#include <linux/threads.h>
++#include <linux/linkage.h>
++#include <asm/segment.h>
++#include <asm/page.h>
++#include <asm/cache.h>
++#include <asm/thread_info.h>
++#include <asm/asm-offsets.h>
++#include <asm/dwarf2.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/elfnote.h>
++
++/*
++ * References to members of the new_cpu_data structure.
++ */
++
++#define X86 new_cpu_data+CPUINFO_x86
++#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
++#define X86_MODEL new_cpu_data+CPUINFO_x86_model
++#define X86_MASK new_cpu_data+CPUINFO_x86_mask
++#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
++#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
++#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
++#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
++
++#define VIRT_ENTRY_OFFSET 0x0
++.org VIRT_ENTRY_OFFSET
++ENTRY(startup_32)
++ movl %esi,xen_start_info
++ cld
++
++ /* Set up the stack pointer */
++ movl $(init_thread_union+THREAD_SIZE),%esp
++
++ /* get vendor info */
++ xorl %eax,%eax # call CPUID with 0 -> return vendor ID
++ XEN_CPUID
++ movl %eax,X86_CPUID # save CPUID level
++ movl %ebx,X86_VENDOR_ID # lo 4 chars
++ movl %edx,X86_VENDOR_ID+4 # next 4 chars
++ movl %ecx,X86_VENDOR_ID+8 # last 4 chars
++
++ movl $1,%eax # Use the CPUID instruction to get CPU type
++ XEN_CPUID
++ movb %al,%cl # save reg for future use
++ andb $0x0f,%ah # mask processor family
++ movb %ah,X86
++ andb $0xf0,%al # mask model
++ shrb $4,%al
++ movb %al,X86_MODEL
++	andb $0x0f,%cl		# mask off the stepping (mask revision)
++ movb %cl,X86_MASK
++ movl %edx,X86_CAPABILITY
++
++ movb $1,X86_HARD_MATH
++
++ xorl %eax,%eax # Clear FS/GS and LDT
++ movl %eax,%fs
++ movl %eax,%gs
++ cld # gcc2 wants the direction flag cleared at all times
++
++ pushl %eax # fake return address
++ jmp start_kernel
++
++#define HYPERCALL_PAGE_OFFSET 0x1000
++.org HYPERCALL_PAGE_OFFSET
++ENTRY(hypercall_page)
++ CFI_STARTPROC
++.skip 0x1000
++ CFI_ENDPROC
++
++/*
++ * Real beginning of normal "text" segment
++ */
++ENTRY(stext)
++ENTRY(_stext)
++
++/*
++ * BSS section
++ */
++.section ".bss.page_aligned","w"
++ENTRY(empty_zero_page)
++ .fill 4096,1,0
++
++/*
++ * This starts the data section.
++ */
++.data
++
++/*
++ * The Global Descriptor Table contains 32 quadwords, per-CPU.
++ */
++ .align L1_CACHE_BYTES
++ENTRY(cpu_gdt_table)
++ .quad 0x0000000000000000 /* NULL descriptor */
++ .quad 0x0000000000000000 /* 0x0b reserved */
++ .quad 0x0000000000000000 /* 0x13 reserved */
++ .quad 0x0000000000000000 /* 0x1b reserved */
++ .quad 0x0000000000000000 /* 0x20 unused */
++ .quad 0x0000000000000000 /* 0x28 unused */
++ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
++ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
++ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
++ .quad 0x0000000000000000 /* 0x4b reserved */
++ .quad 0x0000000000000000 /* 0x53 reserved */
++ .quad 0x0000000000000000 /* 0x5b reserved */
++
++ .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
++ .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
++ .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
++ .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
++
++ .quad 0x0000000000000000 /* 0x80 TSS descriptor */
++ .quad 0x0000000000000000 /* 0x88 LDT descriptor */
++
++ /*
++ * Segments used for calling PnP BIOS have byte granularity.
++ * The code segments and data segments have fixed 64k limits,
++ * the transfer segment sizes are set at run time.
++ */
++ .quad 0x0000000000000000 /* 0x90 32-bit code */
++ .quad 0x0000000000000000 /* 0x98 16-bit code */
++ .quad 0x0000000000000000 /* 0xa0 16-bit data */
++ .quad 0x0000000000000000 /* 0xa8 16-bit data */
++ .quad 0x0000000000000000 /* 0xb0 16-bit data */
++
++ /*
++ * The APM segments have byte granularity and their bases
++ * are set at run time. All have 64k limits.
++ */
++ .quad 0x0000000000000000 /* 0xb8 APM CS code */
++ .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
++ .quad 0x0000000000000000 /* 0xc8 APM DS data */
++
++ .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */
++ .quad 0x0000000000000000 /* 0xd8 - unused */
++ .quad 0x0000000000000000 /* 0xe0 - unused */
++ .quad 0x0000000000000000 /* 0xe8 - unused */
++ .quad 0x0000000000000000 /* 0xf0 - unused */
++ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++/*
++ * __xen_guest information
++ */
++.macro utoa value
++ .if (\value) < 0 || (\value) >= 0x10
++ utoa (((\value)>>4)&0x0fffffff)
++ .endif
++ .if ((\value) & 0xf) < 10
++ .byte '0' + ((\value) & 0xf)
++ .else
++ .byte 'A' + ((\value) & 0xf) - 10
++ .endif
++.endm
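
   The utoa macro above converts a value to uppercase hex ASCII at assembly
   time, emitting the most significant nibble first by recursing before it
   prints; the 0x0fffffff mask forces a logical shift even if the assembler
   treats the value as signed. A C analogue of the same digit ordering — an
   illustrative sketch only, not part of the patch:

       #include <stdio.h>

       /* Sketch: C analogue of the utoa assembler macro above. */
       static void utoa(unsigned long value)
       {
           if (value >= 0x10)
               utoa(value >> 4);                      /* high nibbles first */
           putchar("0123456789ABCDEF"[value & 0xf]);  /* then this nibble */
       }
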
++
++.section __xen_guest
++ .ascii "GUEST_OS=linux,GUEST_VER=2.6"
++ .ascii ",XEN_VER=xen-3.0"
++ .ascii ",VIRT_BASE=0x"
++ utoa __PAGE_OFFSET
++ .ascii ",ELF_PADDR_OFFSET=0x"
++ utoa __PAGE_OFFSET
++ .ascii ",VIRT_ENTRY=0x"
++ utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
++ .ascii ",HYPERCALL_PAGE=0x"
++ utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
++ .ascii ",FEATURES=writable_page_tables"
++ .ascii "|writable_descriptor_tables"
++ .ascii "|auto_translated_physmap"
++ .ascii "|pae_pgdir_above_4gb"
++ .ascii "|supervisor_mode_kernel"
++#ifdef CONFIG_X86_PAE
++ .ascii ",PAE=yes[extended-cr3]"
++#else
++ .ascii ",PAE=no"
++#endif
++ .ascii ",LOADER=generic"
++ .byte 0
++#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
++
++
++ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
++ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
++ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
++ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET)
++#if CONFIG_XEN_COMPAT <= 0x030002
++ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET)
++#else
++ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0)
++#endif
++ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32)
++ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page)
++ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START)
++ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
++#ifdef CONFIG_X86_PAE
++ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes")
++ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT)
++#else
++ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no")
++ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT)
++#endif
++ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
++ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1)
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/init_task-xen.c 2007-08-27 14:01:24.000000000 -0400
+@@ -0,0 +1,51 @@
++#include <linux/mm.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/init.h>
++#include <linux/init_task.h>
++#include <linux/fs.h>
++#include <linux/mqueue.h>
++
++#include <asm/uaccess.h>
++#include <asm/pgtable.h>
++#include <asm/desc.h>
++
++static struct fs_struct init_fs = INIT_FS;
++static struct files_struct init_files = INIT_FILES;
++static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
++static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
++
++#define swapper_pg_dir ((pgd_t *)NULL)
++struct mm_struct init_mm = INIT_MM(init_mm);
++#undef swapper_pg_dir
++
++EXPORT_SYMBOL(init_mm);
++
++/*
++ * Initial thread structure.
++ *
++ * We need to make sure that this is THREAD_SIZE aligned due to the
++ * way process stacks are handled. This is done by having a special
++ * "init_task" linker map entry..
++ */
++union thread_union init_thread_union
++ __attribute__((__section__(".data.init_task"))) =
++ { INIT_THREAD_INFO(init_task) };
++
++/*
++ * Initial task structure.
++ *
++ * All other task structs will be allocated on slabs in fork.c
++ */
++struct task_struct init_task = INIT_TASK(init_task);
++
++EXPORT_SYMBOL(init_task);
++
++#ifndef CONFIG_X86_NO_TSS
++/*
++ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
++ * no more per-task TSS's.
++ */
++DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
++#endif
++
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/io_apic-xen.c 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,2777 @@
++/*
++ * Intel IO-APIC support for multi-Pentium hosts.
++ *
++ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
++ *
++ * Many thanks to Stig Venaas for trying out countless experimental
++ * patches and reporting/debugging problems patiently!
++ *
++ * (c) 1999, Multiple IO-APIC support, developed by
++ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
++ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
++ * further tested and cleaned up by Zach Brown <zab@redhat.com>
++ * and Ingo Molnar <mingo@redhat.com>
++ *
++ * Fixes
++ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
++ * thanks to Eric Gilmore
++ * and Rolf G. Tews
++ * for testing these extensively
++ * Paul Diefenbaugh : Added full ACPI support
++ */
++
++#include <linux/mm.h>
++#include <linux/interrupt.h>
++#include <linux/init.h>
++#include <linux/delay.h>
++#include <linux/sched.h>
++#include <linux/smp_lock.h>
++#include <linux/mc146818rtc.h>
++#include <linux/compiler.h>
++#include <linux/acpi.h>
++#include <linux/module.h>
++#include <linux/sysdev.h>
++
++#include <asm/io.h>
++#include <asm/smp.h>
++#include <asm/desc.h>
++#include <asm/timer.h>
++#include <asm/i8259.h>
++#include <asm/nmi.h>
++
++#include <mach_apic.h>
++
++#include "io_ports.h"
++
++#ifdef CONFIG_XEN
++
++#include <xen/interface/xen.h>
++#include <xen/interface/physdev.h>
++
++/* Fake i8259 */
++#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
++#define disable_8259A_irq(_irq) ((void)0)
++#define i8259A_irq_pending(_irq) (0)
++
++unsigned long io_apic_irqs;
++
++static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
++{
++ struct physdev_apic apic_op;
++ int ret;
++
++ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
++ apic_op.reg = reg;
++ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
++ if (ret)
++ return ret;
++ return apic_op.value;
++}
++
++static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
++{
++ struct physdev_apic apic_op;
++
++ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
++ apic_op.reg = reg;
++ apic_op.value = value;
++ HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
++}
++
++#define io_apic_read(a,r) xen_io_apic_read(a,r)
++#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
++
++#endif /* CONFIG_XEN */
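
   A paravirtualized guest cannot touch IO-APIC registers directly, so the
   two wrappers above proxy every access through the PHYSDEVOP_apic_read and
   PHYSDEVOP_apic_write hypercalls; note that a failed read returns the
   (negative) error code in place of a register value. A usage sketch,
   fetching the IO-APIC ID (register index 0):

       /* Sketch: reading through the hypercall-backed accessor above. */
       union IO_APIC_reg_00 reg_00;
       reg_00.raw = io_apic_read(apic, 0);    /* PHYSDEVOP_apic_read */
       printk(KERN_DEBUG "IO-APIC id %02X\n", reg_00.bits.ID);
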
++
++int (*ioapic_renumber_irq)(int ioapic, int irq);
++atomic_t irq_mis_count;
++
++/* Where if anywhere is the i8259 connect in external int mode */
++static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
++
++static DEFINE_SPINLOCK(ioapic_lock);
++static DEFINE_SPINLOCK(vector_lock);
++
++int timer_over_8254 __initdata = 1;
++
++/*
++ * Is the SiS APIC rmw bug present?
++ * -1 = don't know, 0 = no, 1 = yes
++ */
++int sis_apic_bug = -1;
++
++/*
++ * # of IRQ routing registers
++ */
++int nr_ioapic_registers[MAX_IO_APICS];
++
++int disable_timer_pin_1 __initdata;
++
++/*
++ * A rough estimate of how many shared IRQs there are; this can
++ * be changed at any time.
++ */
++#define MAX_PLUS_SHARED_IRQS NR_IRQS
++#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
++
++/*
++ * This is performance-critical, we want to do it O(1)
++ *
++ * the indexing order of this array favors 1:1 mappings
++ * between pins and IRQs.
++ */
++
++static struct irq_pin_list {
++ int apic, pin, next;
++} irq_2_pin[PIN_MAP_SIZE];
++
++int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
++#ifdef CONFIG_PCI_MSI
++#define vector_to_irq(vector) \
++ (platform_legacy_irq(vector) ? vector : vector_irq[vector])
++#else
++#define vector_to_irq(vector) (vector)
++#endif
++
++/*
++ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
++ * shared ISA-space IRQs, so we have to support them. We are super
++ * fast in the common case, and fast for shared ISA-space IRQs.
++ */
++static void add_pin_to_irq(unsigned int irq, int apic, int pin)
++{
++ static int first_free_entry = NR_IRQS;
++ struct irq_pin_list *entry = irq_2_pin + irq;
++
++ while (entry->next)
++ entry = irq_2_pin + entry->next;
++
++ if (entry->pin != -1) {
++ entry->next = first_free_entry;
++ entry = irq_2_pin + entry->next;
++ if (++first_free_entry >= PIN_MAP_SIZE)
++ panic("io_apic.c: whoops");
++ }
++ entry->apic = apic;
++ entry->pin = pin;
++}
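
   irq_2_pin[] doubles as both list heads and spill storage: the first
   NR_IRQS slots are per-IRQ heads, and overflow entries are taken from
   first_free_entry and chained by array index rather than by pointer. A
   hypothetical traversal helper, sketched here only to make the layout
   explicit (not in the patch):

       /* Sketch: walk every apic/pin pair recorded for one IRQ. */
       static void walk_pins(unsigned int irq)
       {
           struct irq_pin_list *entry = irq_2_pin + irq;

           for (;;) {
               if (entry->pin != -1)
                   printk(KERN_DEBUG "IRQ%u -> apic %d pin %d\n",
                          irq, entry->apic, entry->pin);
               if (!entry->next)
                   break;
               entry = irq_2_pin + entry->next;
           }
       }
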
++
++#ifdef CONFIG_XEN
++#define clear_IO_APIC() ((void)0)
++#else
++/*
++ * Reroute an IRQ to a different pin.
++ */
++static void __init replace_pin_at_irq(unsigned int irq,
++ int oldapic, int oldpin,
++ int newapic, int newpin)
++{
++ struct irq_pin_list *entry = irq_2_pin + irq;
++
++ while (1) {
++ if (entry->apic == oldapic && entry->pin == oldpin) {
++ entry->apic = newapic;
++ entry->pin = newpin;
++ }
++ if (!entry->next)
++ break;
++ entry = irq_2_pin + entry->next;
++ }
++}
++
++static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
++{
++ struct irq_pin_list *entry = irq_2_pin + irq;
++ unsigned int pin, reg;
++
++ for (;;) {
++ pin = entry->pin;
++ if (pin == -1)
++ break;
++ reg = io_apic_read(entry->apic, 0x10 + pin*2);
++ reg &= ~disable;
++ reg |= enable;
++ io_apic_modify(entry->apic, 0x10 + pin*2, reg);
++ if (!entry->next)
++ break;
++ entry = irq_2_pin + entry->next;
++ }
++}
++
++/* mask = 1 */
++static void __mask_IO_APIC_irq (unsigned int irq)
++{
++ __modify_IO_APIC_irq(irq, 0x00010000, 0);
++}
++
++/* mask = 0 */
++static void __unmask_IO_APIC_irq (unsigned int irq)
++{
++ __modify_IO_APIC_irq(irq, 0, 0x00010000);
++}
++
++/* mask = 1, trigger = 0 */
++static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
++{
++ __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
++}
++
++/* mask = 0, trigger = 1 */
++static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
++{
++ __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
++}
++
++static void mask_IO_APIC_irq (unsigned int irq)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ __mask_IO_APIC_irq(irq);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++}
++
++static void unmask_IO_APIC_irq (unsigned int irq)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ __unmask_IO_APIC_irq(irq);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++}
++
++static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
++{
++ struct IO_APIC_route_entry entry;
++ unsigned long flags;
++
++ /* Check delivery_mode to be sure we're not clearing an SMI pin */
++ spin_lock_irqsave(&ioapic_lock, flags);
++ *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
++ *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++ if (entry.delivery_mode == dest_SMI)
++ return;
++
++ /*
++ * Disable it in the IO-APIC irq-routing table:
++ */
++ memset(&entry, 0, sizeof(entry));
++ entry.mask = 1;
++ spin_lock_irqsave(&ioapic_lock, flags);
++ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
++ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++}
++
++static void clear_IO_APIC (void)
++{
++ int apic, pin;
++
++ for (apic = 0; apic < nr_ioapics; apic++)
++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
++ clear_IO_APIC_pin(apic, pin);
++}
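
   Each redirection-table entry is 64 bits wide but the IO-APIC data window
   is 32 bits, which is why clear_IO_APIC_pin() reads and writes the entry as
   two ints through pointer casts. A union makes the same aliasing explicit;
   this is a sketch of an alternative, not what the patch does:

       /* Sketch (hypothetical helper): the same two-word access via a union. */
       union rte_words {
           struct IO_APIC_route_entry entry;
           unsigned int w[2];
       };

       static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
       {
           union rte_words u;

           u.w[0] = io_apic_read(apic, 0x10 + 2 * pin);
           u.w[1] = io_apic_read(apic, 0x11 + 2 * pin);
           return u.entry;
       }
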
++
++#ifdef CONFIG_SMP
++static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
++{
++ unsigned long flags;
++ int pin;
++ struct irq_pin_list *entry = irq_2_pin + irq;
++ unsigned int apicid_value;
++ cpumask_t tmp;
++
++ cpus_and(tmp, cpumask, cpu_online_map);
++ if (cpus_empty(tmp))
++ tmp = TARGET_CPUS;
++
++ cpus_and(cpumask, tmp, CPU_MASK_ALL);
++
++ apicid_value = cpu_mask_to_apicid(cpumask);
++ /* Prepare to do the io_apic_write */
++ apicid_value = apicid_value << 24;
++ spin_lock_irqsave(&ioapic_lock, flags);
++ for (;;) {
++ pin = entry->pin;
++ if (pin == -1)
++ break;
++ io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
++ if (!entry->next)
++ break;
++ entry = irq_2_pin + entry->next;
++ }
++ set_irq_info(irq, cpumask);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++}
++
++#if defined(CONFIG_IRQBALANCE)
++# include <asm/processor.h> /* kernel_thread() */
++# include <linux/kernel_stat.h> /* kstat */
++# include <linux/slab.h> /* kmalloc() */
++# include <linux/timer.h> /* time_after() */
++
++#ifdef CONFIG_BALANCED_IRQ_DEBUG
++# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
++# define Dprintk(x...) do { TDprintk(x); } while (0)
++# else
++# define TDprintk(x...)
++# define Dprintk(x...)
++# endif
++
++#define IRQBALANCE_CHECK_ARCH -999
++#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
++#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
++#define BALANCED_IRQ_MORE_DELTA (HZ/10)
++#define BALANCED_IRQ_LESS_DELTA (HZ)
++
++static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
++static int physical_balance __read_mostly;
++static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
++
++static struct irq_cpu_info {
++ unsigned long * last_irq;
++ unsigned long * irq_delta;
++ unsigned long irq;
++} irq_cpu_data[NR_CPUS];
++
++#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
++#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
++#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
++
++#define IDLE_ENOUGH(cpu,now) \
++ (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
++
++#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
++
++#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
++
++static cpumask_t balance_irq_affinity[NR_IRQS] = {
++ [0 ... NR_IRQS-1] = CPU_MASK_ALL
++};
++
++void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
++{
++ balance_irq_affinity[irq] = mask;
++}
++
++static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
++ unsigned long now, int direction)
++{
++ int search_idle = 1;
++ int cpu = curr_cpu;
++
++ goto inside;
++
++ do {
++ if (unlikely(cpu == curr_cpu))
++ search_idle = 0;
++inside:
++ if (direction == 1) {
++ cpu++;
++ if (cpu >= NR_CPUS)
++ cpu = 0;
++ } else {
++ cpu--;
++ if (cpu == -1)
++ cpu = NR_CPUS-1;
++ }
++ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
++ (search_idle && !IDLE_ENOUGH(cpu,now)));
++
++ return cpu;
++}
++
++static inline void balance_irq(int cpu, int irq)
++{
++ unsigned long now = jiffies;
++ cpumask_t allowed_mask;
++ unsigned int new_cpu;
++
++ if (irqbalance_disabled)
++ return;
++
++ cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
++ new_cpu = move(cpu, allowed_mask, now, 1);
++ if (cpu != new_cpu) {
++ set_pending_irq(irq, cpumask_of_cpu(new_cpu));
++ }
++}
++
++static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
++{
++ int i, j;
++ Dprintk("Rotating IRQs among CPUs.\n");
++ for_each_online_cpu(i) {
++ for (j = 0; j < NR_IRQS; j++) {
++ if (!irq_desc[j].action)
++ continue;
++ /* Is it a significant load ? */
++ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
++ useful_load_threshold)
++ continue;
++ balance_irq(i, j);
++ }
++ }
++ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
++ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
++ return;
++}
++
++static void do_irq_balance(void)
++{
++ int i, j;
++ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
++ unsigned long move_this_load = 0;
++ int max_loaded = 0, min_loaded = 0;
++ int load;
++ unsigned long useful_load_threshold = balanced_irq_interval + 10;
++ int selected_irq;
++ int tmp_loaded, first_attempt = 1;
++ unsigned long tmp_cpu_irq;
++ unsigned long imbalance = 0;
++ cpumask_t allowed_mask, target_cpu_mask, tmp;
++
++ for_each_possible_cpu(i) {
++ int package_index;
++ CPU_IRQ(i) = 0;
++ if (!cpu_online(i))
++ continue;
++ package_index = CPU_TO_PACKAGEINDEX(i);
++ for (j = 0; j < NR_IRQS; j++) {
++ unsigned long value_now, delta;
++ /* Is this an active IRQ? */
++ if (!irq_desc[j].action)
++ continue;
++ if ( package_index == i )
++ IRQ_DELTA(package_index,j) = 0;
++ /* Determine the total count per processor per IRQ */
++ value_now = (unsigned long) kstat_cpu(i).irqs[j];
++
++ /* Determine the activity per processor per IRQ */
++ delta = value_now - LAST_CPU_IRQ(i,j);
++
++ /* Update last_cpu_irq[][] for the next time */
++ LAST_CPU_IRQ(i,j) = value_now;
++
++ /* Ignore IRQs whose rate is less than the clock */
++ if (delta < useful_load_threshold)
++ continue;
++ /* update the load for the processor or package total */
++ IRQ_DELTA(package_index,j) += delta;
++
++ /* Keep track of the higher numbered sibling as well */
++ if (i != package_index)
++ CPU_IRQ(i) += delta;
++ /*
++ * We have sibling A and sibling B in the package
++ *
++ * cpu_irq[A] = load for cpu A + load for cpu B
++ * cpu_irq[B] = load for cpu B
++ */
++ CPU_IRQ(package_index) += delta;
++ }
++ }
++ /* Find the least loaded processor package */
++ for_each_online_cpu(i) {
++ if (i != CPU_TO_PACKAGEINDEX(i))
++ continue;
++ if (min_cpu_irq > CPU_IRQ(i)) {
++ min_cpu_irq = CPU_IRQ(i);
++ min_loaded = i;
++ }
++ }
++ max_cpu_irq = ULONG_MAX;
++
++tryanothercpu:
++	/* Look for the heaviest loaded processor.
++	 * We may come back later for the next heaviest one.
++	 * Skip processors with trivial loads.
++ */
++ tmp_cpu_irq = 0;
++ tmp_loaded = -1;
++ for_each_online_cpu(i) {
++ if (i != CPU_TO_PACKAGEINDEX(i))
++ continue;
++ if (max_cpu_irq <= CPU_IRQ(i))
++ continue;
++ if (tmp_cpu_irq < CPU_IRQ(i)) {
++ tmp_cpu_irq = CPU_IRQ(i);
++ tmp_loaded = i;
++ }
++ }
++
++ if (tmp_loaded == -1) {
++		/* With only a small number of heavy interrupt sources,
++		 * some of the cpus end up loaded too much. We use Ingo's
++		 * original approach and rotate the IRQs around.
++ */
++ if (!first_attempt && imbalance >= useful_load_threshold) {
++ rotate_irqs_among_cpus(useful_load_threshold);
++ return;
++ }
++ goto not_worth_the_effort;
++ }
++
++ first_attempt = 0; /* heaviest search */
++ max_cpu_irq = tmp_cpu_irq; /* load */
++ max_loaded = tmp_loaded; /* processor */
++ imbalance = (max_cpu_irq - min_cpu_irq) / 2;
++
++ Dprintk("max_loaded cpu = %d\n", max_loaded);
++ Dprintk("min_loaded cpu = %d\n", min_loaded);
++ Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
++ Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
++ Dprintk("load imbalance = %lu\n", imbalance);
++
++	/* If the imbalance is less than max load / 8 (approx 12.5%),
++	 * we hit diminishing returns - quit.
++ */
++ if (imbalance < (max_cpu_irq >> 3)) {
++ Dprintk("Imbalance too trivial\n");
++ goto not_worth_the_effort;
++ }
++
++tryanotherirq:
++ /* if we select an IRQ to move that can't go where we want, then
++ * see if there is another one to try.
++ */
++ move_this_load = 0;
++ selected_irq = -1;
++ for (j = 0; j < NR_IRQS; j++) {
++ /* Is this an active IRQ? */
++ if (!irq_desc[j].action)
++ continue;
++ if (imbalance <= IRQ_DELTA(max_loaded,j))
++ continue;
++ /* Try to find the IRQ that is closest to the imbalance
++ * without going over.
++ */
++ if (move_this_load < IRQ_DELTA(max_loaded,j)) {
++ move_this_load = IRQ_DELTA(max_loaded,j);
++ selected_irq = j;
++ }
++ }
++ if (selected_irq == -1) {
++ goto tryanothercpu;
++ }
++
++ imbalance = move_this_load;
++
++	/* For the physical_balance case, we accumulated both load
++	 * values in one of the siblings' cpu_irq[] slots,
++ * to use the same code for physical and logical processors
++ * as much as possible.
++ *
++ * NOTE: the cpu_irq[] array holds the sum of the load for
++ * sibling A and sibling B in the slot for the lowest numbered
++ * sibling (A), _AND_ the load for sibling B in the slot for
++ * the higher numbered sibling.
++ *
++ * We seek the least loaded sibling by making the comparison
++ * (A+B)/2 vs B
++ */
++ load = CPU_IRQ(min_loaded) >> 1;
++ for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
++ if (load > CPU_IRQ(j)) {
++ /* This won't change cpu_sibling_map[min_loaded] */
++ load = CPU_IRQ(j);
++ min_loaded = j;
++ }
++ }
++
++ cpus_and(allowed_mask,
++ cpu_online_map,
++ balance_irq_affinity[selected_irq]);
++ target_cpu_mask = cpumask_of_cpu(min_loaded);
++ cpus_and(tmp, target_cpu_mask, allowed_mask);
++
++ if (!cpus_empty(tmp)) {
++
++ Dprintk("irq = %d moved to cpu = %d\n",
++ selected_irq, min_loaded);
++ /* mark for change destination */
++ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
++
++ /* Since we made a change, come back sooner to
++ * check for more variation.
++ */
++ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
++ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
++ return;
++ }
++ goto tryanotherirq;
++
++not_worth_the_effort:
++ /*
++ * if we did not find an IRQ to move, then adjust the time interval
++ * upward
++ */
++ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
++ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
++ Dprintk("IRQ worth rotating not found\n");
++ return;
++}
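
   The sibling bookkeeping in do_irq_balance() is easiest to see with
   numbers (invented for illustration):

       /* Worked example (invented loads):
        *   load(A) = 300, load(B) = 100, A = lower-numbered sibling
        *   CPU_IRQ(A) = 300 + 100 = 400, CPU_IRQ(B) = 100
        *   (A+B)/2 = CPU_IRQ(A) >> 1 = 200 > CPU_IRQ(B) = 100
        *   => sibling B is the less loaded target.
        */
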
++
++static int balanced_irq(void *unused)
++{
++ int i;
++ unsigned long prev_balance_time = jiffies;
++ long time_remaining = balanced_irq_interval;
++
++ daemonize("kirqd");
++
++ /* push everything to CPU 0 to give us a starting point. */
++ for (i = 0 ; i < NR_IRQS ; i++) {
++ irq_desc[i].pending_mask = cpumask_of_cpu(0);
++ set_pending_irq(i, cpumask_of_cpu(0));
++ }
++
++ for ( ; ; ) {
++ time_remaining = schedule_timeout_interruptible(time_remaining);
++ try_to_freeze();
++ if (time_after(jiffies,
++ prev_balance_time+balanced_irq_interval)) {
++ preempt_disable();
++ do_irq_balance();
++ prev_balance_time = jiffies;
++ time_remaining = balanced_irq_interval;
++ preempt_enable();
++ }
++ }
++ return 0;
++}
++
++static int __init balanced_irq_init(void)
++{
++ int i;
++ struct cpuinfo_x86 *c;
++ cpumask_t tmp;
++
++ cpus_shift_right(tmp, cpu_online_map, 2);
++ c = &boot_cpu_data;
++	/* When not overridden on the command line, ask the subarchitecture. */
++ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
++ irqbalance_disabled = NO_BALANCE_IRQ;
++ if (irqbalance_disabled)
++ return 0;
++
++ /* disable irqbalance completely if there is only one processor online */
++ if (num_online_cpus() < 2) {
++ irqbalance_disabled = 1;
++ return 0;
++ }
++ /*
++ * Enable physical balance only if more than 1 physical processor
++ * is present
++ */
++ if (smp_num_siblings > 1 && !cpus_empty(tmp))
++ physical_balance = 1;
++
++ for_each_online_cpu(i) {
++ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
++ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
++ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
++ printk(KERN_ERR "balanced_irq_init: out of memory");
++ goto failed;
++ }
++ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
++ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
++ }
++
++ printk(KERN_INFO "Starting balanced_irq\n");
++ if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
++ return 0;
++ else
++ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
++failed:
++ for_each_possible_cpu(i) {
++ kfree(irq_cpu_data[i].irq_delta);
++ irq_cpu_data[i].irq_delta = NULL;
++ kfree(irq_cpu_data[i].last_irq);
++ irq_cpu_data[i].last_irq = NULL;
++ }
++ return 0;
++}
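
   The kmalloc-plus-memset pairs in balanced_irq_init() could be collapsed
   with kzalloc(), which is available in this kernel generation; shown as an
   alternative sketch, not as a change to the patch:

       /* Sketch: zeroed allocation in one call. */
       irq_cpu_data[i].irq_delta =
           kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
       irq_cpu_data[i].last_irq =
           kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
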
++
++int __init irqbalance_disable(char *str)
++{
++ irqbalance_disabled = 1;
++ return 1;
++}
++
++__setup("noirqbalance", irqbalance_disable);
++
++late_initcall(balanced_irq_init);
++#endif /* CONFIG_IRQBALANCE */
++#endif /* CONFIG_SMP */
++#endif
++
++#ifndef CONFIG_SMP
++void fastcall send_IPI_self(int vector)
++{
++#ifndef CONFIG_XEN
++ unsigned int cfg;
++
++ /*
++ * Wait for idle.
++ */
++ apic_wait_icr_idle();
++ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
++ /*
++ * Send the IPI. The write to APIC_ICR fires this off.
++ */
++ apic_write_around(APIC_ICR, cfg);
++#endif
++}
++#endif /* !CONFIG_SMP */
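
   On native hardware the IPI is triggered by the write to APIC_ICR;
   apic_wait_icr_idle() exists because the ICR must report idle (busy bit
   clear) before a new command is written. Roughly, and only as a sketch of
   what that helper does:

       /* Sketch: what apic_wait_icr_idle() amounts to. */
       while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
           cpu_relax();
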
++
++
++/*
++ * Support for broken MP BIOSes; enables hand-redirection of PIRQ0-7 to
++ * specific CPU-side IRQs.
++ */
++
++#define MAX_PIRQS 8
++static int pirq_entries [MAX_PIRQS];
++static int pirqs_enabled;
++int skip_ioapic_setup;
++
++static int __init ioapic_setup(char *str)
++{
++ skip_ioapic_setup = 1;
++ return 1;
++}
++
++__setup("noapic", ioapic_setup);
++
++static int __init ioapic_pirq_setup(char *str)
++{
++ int i, max;
++ int ints[MAX_PIRQS+1];
++
++ get_options(str, ARRAY_SIZE(ints), ints);
++
++ for (i = 0; i < MAX_PIRQS; i++)
++ pirq_entries[i] = -1;
++
++ pirqs_enabled = 1;
++ apic_printk(APIC_VERBOSE, KERN_INFO
++ "PIRQ redirection, working around broken MP-BIOS.\n");
++ max = MAX_PIRQS;
++ if (ints[0] < MAX_PIRQS)
++ max = ints[0];
++
++ for (i = 0; i < max; i++) {
++ apic_printk(APIC_VERBOSE, KERN_DEBUG
++ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
++ /*
++ * PIRQs are mapped upside down, usually.
++ */
++ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
++ }
++ return 1;
++}
++
++__setup("pirq=", ioapic_pirq_setup);
++
++/*
++ * Find the IRQ entry number of a certain pin.
++ */
++static int find_irq_entry(int apic, int pin, int type)
++{
++ int i;
++
++ for (i = 0; i < mp_irq_entries; i++)
++ if (mp_irqs[i].mpc_irqtype == type &&
++ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
++ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
++ mp_irqs[i].mpc_dstirq == pin)
++ return i;
++
++ return -1;
++}
++
++/*
++ * Find the pin to which IRQ[irq] (ISA) is connected
++ */
++static int __init find_isa_irq_pin(int irq, int type)
++{
++ int i;
++
++ for (i = 0; i < mp_irq_entries; i++) {
++ int lbus = mp_irqs[i].mpc_srcbus;
++
++ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
++ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
++ mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
++ mp_bus_id_to_type[lbus] == MP_BUS_NEC98
++ ) &&
++ (mp_irqs[i].mpc_irqtype == type) &&
++ (mp_irqs[i].mpc_srcbusirq == irq))
++
++ return mp_irqs[i].mpc_dstirq;
++ }
++ return -1;
++}
++
++static int __init find_isa_irq_apic(int irq, int type)
++{
++ int i;
++
++ for (i = 0; i < mp_irq_entries; i++) {
++ int lbus = mp_irqs[i].mpc_srcbus;
++
++ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
++ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
++ mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
++ mp_bus_id_to_type[lbus] == MP_BUS_NEC98
++ ) &&
++ (mp_irqs[i].mpc_irqtype == type) &&
++ (mp_irqs[i].mpc_srcbusirq == irq))
++ break;
++ }
++ if (i < mp_irq_entries) {
++ int apic;
++ for(apic = 0; apic < nr_ioapics; apic++) {
++ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
++ return apic;
++ }
++ }
++
++ return -1;
++}
++
++/*
++ * Find a specific PCI IRQ entry.
++ * Not an __init, possibly needed by modules
++ */
++static int pin_2_irq(int idx, int apic, int pin);
++
++int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
++{
++ int apic, i, best_guess = -1;
++
++ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
++ "slot:%d, pin:%d.\n", bus, slot, pin);
++ if (mp_bus_id_to_pci_bus[bus] == -1) {
++ printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
++ return -1;
++ }
++ for (i = 0; i < mp_irq_entries; i++) {
++ int lbus = mp_irqs[i].mpc_srcbus;
++
++ for (apic = 0; apic < nr_ioapics; apic++)
++ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
++ mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
++ break;
++
++ if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
++ !mp_irqs[i].mpc_irqtype &&
++ (bus == lbus) &&
++ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
++ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
++
++ if (!(apic || IO_APIC_IRQ(irq)))
++ continue;
++
++ if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
++ return irq;
++ /*
++ * Use the first all-but-pin matching entry as a
++ * best-guess fuzzy result for broken mptables.
++ */
++ if (best_guess < 0)
++ best_guess = irq;
++ }
++ }
++ return best_guess;
++}
++EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
++
++/*
++ * This function is currently only a helper for the i386 smp boot process,
++ * where we need to reprogram the ioredtbls to cater for the cpus which
++ * have come online, so the mask in all cases should simply be TARGET_CPUS.
++ */
++#ifdef CONFIG_SMP
++#ifndef CONFIG_XEN
++void __init setup_ioapic_dest(void)
++{
++ int pin, ioapic, irq, irq_entry;
++
++ if (skip_ioapic_setup == 1)
++ return;
++
++ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
++ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
++ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
++ if (irq_entry == -1)
++ continue;
++ irq = pin_2_irq(irq_entry, ioapic, pin);
++ set_ioapic_affinity_irq(irq, TARGET_CPUS);
++ }
++
++ }
++}
++#endif /* !CONFIG_XEN */
++#endif
++
++/*
++ * EISA Edge/Level control register, ELCR
++ */
++static int EISA_ELCR(unsigned int irq)
++{
++ if (irq < 16) {
++ unsigned int port = 0x4d0 + (irq >> 3);
++ return (inb(port) >> (irq & 7)) & 1;
++ }
++ apic_printk(APIC_VERBOSE, KERN_INFO
++ "Broken MPtable reports ISA irq %d\n", irq);
++ return 0;
++}
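
   The ELCR is a pair of I/O ports (0x4d0 for IRQs 0-7, 0x4d1 for IRQs 8-15)
   with one trigger bit per IRQ, which is what the arithmetic above encodes:

       /* Worked example: irq 9
        *   port = 0x4d0 + (9 >> 3) = 0x4d1
        *   bit  = 9 & 7           = 1
        *   trigger = (inb(0x4d1) >> 1) & 1
        */
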
++
++/* EISA interrupts are always polarity zero and can be edge or level
++ * trigger depending on the ELCR value. If an interrupt is listed as
++ * EISA conforming in the MP table, that means its trigger type must
++ * be read in from the ELCR */
++
++#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
++#define default_EISA_polarity(idx) (0)
++
++/* ISA interrupts are always polarity zero edge triggered,
++ * when listed as conforming in the MP table. */
++
++#define default_ISA_trigger(idx) (0)
++#define default_ISA_polarity(idx) (0)
++
++/* PCI interrupts are always polarity one level triggered,
++ * when listed as conforming in the MP table. */
++
++#define default_PCI_trigger(idx) (1)
++#define default_PCI_polarity(idx) (1)
++
++/* MCA interrupts are always polarity zero level triggered,
++ * when listed as conforming in the MP table. */
++
++#define default_MCA_trigger(idx) (1)
++#define default_MCA_polarity(idx) (0)
++
++/* NEC98 interrupts are always polarity zero edge triggered,
++ * when listed as conforming in the MP table. */
++
++#define default_NEC98_trigger(idx) (0)
++#define default_NEC98_polarity(idx) (0)
++
++static int __init MPBIOS_polarity(int idx)
++{
++ int bus = mp_irqs[idx].mpc_srcbus;
++ int polarity;
++
++ /*
++ * Determine IRQ line polarity (high active or low active):
++ */
++ switch (mp_irqs[idx].mpc_irqflag & 3)
++ {
++ case 0: /* conforms, ie. bus-type dependent polarity */
++ {
++ switch (mp_bus_id_to_type[bus])
++ {
++ case MP_BUS_ISA: /* ISA pin */
++ {
++ polarity = default_ISA_polarity(idx);
++ break;
++ }
++ case MP_BUS_EISA: /* EISA pin */
++ {
++ polarity = default_EISA_polarity(idx);
++ break;
++ }
++ case MP_BUS_PCI: /* PCI pin */
++ {
++ polarity = default_PCI_polarity(idx);
++ break;
++ }
++ case MP_BUS_MCA: /* MCA pin */
++ {
++ polarity = default_MCA_polarity(idx);
++ break;
++ }
++ case MP_BUS_NEC98: /* NEC 98 pin */
++ {
++ polarity = default_NEC98_polarity(idx);
++ break;
++ }
++ default:
++ {
++ printk(KERN_WARNING "broken BIOS!!\n");
++ polarity = 1;
++ break;
++ }
++ }
++ break;
++ }
++ case 1: /* high active */
++ {
++ polarity = 0;
++ break;
++ }
++ case 2: /* reserved */
++ {
++ printk(KERN_WARNING "broken BIOS!!\n");
++ polarity = 1;
++ break;
++ }
++ case 3: /* low active */
++ {
++ polarity = 1;
++ break;
++ }
++ default: /* invalid */
++ {
++ printk(KERN_WARNING "broken BIOS!!\n");
++ polarity = 1;
++ break;
++ }
++ }
++ return polarity;
++}
++
++static int MPBIOS_trigger(int idx)
++{
++ int bus = mp_irqs[idx].mpc_srcbus;
++ int trigger;
++
++ /*
++ * Determine IRQ trigger mode (edge or level sensitive):
++ */
++ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
++ {
++ case 0: /* conforms, ie. bus-type dependent */
++ {
++ switch (mp_bus_id_to_type[bus])
++ {
++ case MP_BUS_ISA: /* ISA pin */
++ {
++ trigger = default_ISA_trigger(idx);
++ break;
++ }
++ case MP_BUS_EISA: /* EISA pin */
++ {
++ trigger = default_EISA_trigger(idx);
++ break;
++ }
++ case MP_BUS_PCI: /* PCI pin */
++ {
++ trigger = default_PCI_trigger(idx);
++ break;
++ }
++ case MP_BUS_MCA: /* MCA pin */
++ {
++ trigger = default_MCA_trigger(idx);
++ break;
++ }
++ case MP_BUS_NEC98: /* NEC 98 pin */
++ {
++ trigger = default_NEC98_trigger(idx);
++ break;
++ }
++ default:
++ {
++ printk(KERN_WARNING "broken BIOS!!\n");
++ trigger = 1;
++ break;
++ }
++ }
++ break;
++ }
++ case 1: /* edge */
++ {
++ trigger = 0;
++ break;
++ }
++ case 2: /* reserved */
++ {
++ printk(KERN_WARNING "broken BIOS!!\n");
++ trigger = 1;
++ break;
++ }
++ case 3: /* level */
++ {
++ trigger = 1;
++ break;
++ }
++ default: /* invalid */
++ {
++ printk(KERN_WARNING "broken BIOS!!\n");
++ trigger = 0;
++ break;
++ }
++ }
++ return trigger;
++}
++
++static inline int irq_polarity(int idx)
++{
++ return MPBIOS_polarity(idx);
++}
++
++static inline int irq_trigger(int idx)
++{
++ return MPBIOS_trigger(idx);
++}
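
   Both switch statements above decode the same 4-bit mpc_irqflag field from
   the MP table: bits 1:0 select polarity and bits 3:2 select trigger, with 0
   meaning "conforms to the bus", 2 reserved, and the handlers falling back
   to safe defaults on anything broken. The raw decode, for reference:

       /* mpc_irqflag decode used by MPBIOS_polarity()/MPBIOS_trigger():
        *   bits 1:0 polarity: 0 conforms, 1 active high, 2 reserved, 3 active low
        *   bits 3:2 trigger:  0 conforms, 1 edge,        2 reserved, 3 level
        */
       int pol  = mp_irqs[idx].mpc_irqflag & 3;
       int trig = (mp_irqs[idx].mpc_irqflag >> 2) & 3;
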
++
++static int pin_2_irq(int idx, int apic, int pin)
++{
++ int irq, i;
++ int bus = mp_irqs[idx].mpc_srcbus;
++
++ /*
++	 * Debugging check; we are in big trouble if this message pops up!
++ */
++ if (mp_irqs[idx].mpc_dstirq != pin)
++ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
++
++ switch (mp_bus_id_to_type[bus])
++ {
++ case MP_BUS_ISA: /* ISA pin */
++ case MP_BUS_EISA:
++ case MP_BUS_MCA:
++ case MP_BUS_NEC98:
++ {
++ irq = mp_irqs[idx].mpc_srcbusirq;
++ break;
++ }
++ case MP_BUS_PCI: /* PCI pin */
++ {
++ /*
++ * PCI IRQs are mapped in order
++ */
++ i = irq = 0;
++ while (i < apic)
++ irq += nr_ioapic_registers[i++];
++ irq += pin;
++
++ /*
++ * For MPS mode, so far only needed by ES7000 platform
++ */
++ if (ioapic_renumber_irq)
++ irq = ioapic_renumber_irq(apic, irq);
++
++ break;
++ }
++ default:
++ {
++ printk(KERN_ERR "unknown bus type %d.\n",bus);
++ irq = 0;
++ break;
++ }
++ }
++
++ /*
++ * PCI IRQ command line redirection. Yes, limits are hardcoded.
++ */
++ if ((pin >= 16) && (pin <= 23)) {
++ if (pirq_entries[pin-16] != -1) {
++ if (!pirq_entries[pin-16]) {
++ apic_printk(APIC_VERBOSE, KERN_DEBUG
++ "disabling PIRQ%d\n", pin-16);
++ } else {
++ irq = pirq_entries[pin-16];
++ apic_printk(APIC_VERBOSE, KERN_DEBUG
++ "using PIRQ%d -> IRQ %d\n",
++ pin-16, irq);
++ }
++ }
++ }
++ return irq;
++}
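
   For PCI, pin_2_irq() numbers pins globally across all IO-APICs: the IRQ
   is the pin index plus the pin counts of every lower-numbered IO-APIC. A
   worked example with invented sizes:

       /* Example: with nr_ioapic_registers[0] == 24, a PCI interrupt on
        * apic 1, pin 3 maps to irq 24 + 3 = 27 (before any ES7000
        * renumbering or PIRQ command-line redirection). */
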
++
++static inline int IO_APIC_irq_trigger(int irq)
++{
++ int apic, idx, pin;
++
++ for (apic = 0; apic < nr_ioapics; apic++) {
++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
++ idx = find_irq_entry(apic,pin,mp_INT);
++ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
++ return irq_trigger(idx);
++ }
++ }
++ /*
++	 * nonexistent IRQs default to edge trigger
++ */
++ return 0;
++}
++
++/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
++u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
++
++int assign_irq_vector(int irq)
++{
++ unsigned long flags;
++ int vector;
++ struct physdev_irq irq_op;
++
++ BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
++
++ spin_lock_irqsave(&vector_lock, flags);
++
++ if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
++ spin_unlock_irqrestore(&vector_lock, flags);
++ return IO_APIC_VECTOR(irq);
++ }
++
++ irq_op.irq = irq;
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
++ spin_unlock_irqrestore(&vector_lock, flags);
++ return -ENOSPC;
++ }
++
++ vector = irq_op.vector;
++ vector_irq[vector] = irq;
++ if (irq != AUTO_ASSIGN)
++ IO_APIC_VECTOR(irq) = vector;
++
++ spin_unlock_irqrestore(&vector_lock, flags);
++
++ return vector;
++}
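
   Unlike the native kernel, which hands out vectors from its own table,
   this assign_irq_vector() asks Xen for one via PHYSDEVOP_alloc_irq_vector
   and caches the result in IO_APIC_VECTOR(irq). The caller pattern, as
   used later in setup_IO_APIC_irqs() (sketch only):

       int vector = assign_irq_vector(irq);
       if (vector < 0)
           return vector;        /* -ENOSPC: Xen had no free vectors */
       entry.vector = vector;
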
++
++#ifndef CONFIG_XEN
++static struct hw_interrupt_type ioapic_level_type;
++static struct hw_interrupt_type ioapic_edge_type;
++
++#define IOAPIC_AUTO -1
++#define IOAPIC_EDGE 0
++#define IOAPIC_LEVEL 1
++
++static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
++{
++ unsigned idx;
++
++ idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
++
++ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
++ trigger == IOAPIC_LEVEL)
++ irq_desc[idx].chip = &ioapic_level_type;
++ else
++ irq_desc[idx].chip = &ioapic_edge_type;
++ set_intr_gate(vector, interrupt[idx]);
++}
++#else
++#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
++#endif
++
++static void __init setup_IO_APIC_irqs(void)
++{
++ struct IO_APIC_route_entry entry;
++ int apic, pin, idx, irq, first_notcon = 1, vector;
++ unsigned long flags;
++
++ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
++
++ for (apic = 0; apic < nr_ioapics; apic++) {
++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
++
++ /*
++ * add it to the IO-APIC irq-routing table:
++ */
++ memset(&entry,0,sizeof(entry));
++
++ entry.delivery_mode = INT_DELIVERY_MODE;
++ entry.dest_mode = INT_DEST_MODE;
++ entry.mask = 0; /* enable IRQ */
++ entry.dest.logical.logical_dest =
++ cpu_mask_to_apicid(TARGET_CPUS);
++
++ idx = find_irq_entry(apic,pin,mp_INT);
++ if (idx == -1) {
++ if (first_notcon) {
++ apic_printk(APIC_VERBOSE, KERN_DEBUG
++ " IO-APIC (apicid-pin) %d-%d",
++ mp_ioapics[apic].mpc_apicid,
++ pin);
++ first_notcon = 0;
++ } else
++ apic_printk(APIC_VERBOSE, ", %d-%d",
++ mp_ioapics[apic].mpc_apicid, pin);
++ continue;
++ }
++
++ entry.trigger = irq_trigger(idx);
++ entry.polarity = irq_polarity(idx);
++
++ if (irq_trigger(idx)) {
++ entry.trigger = 1;
++ entry.mask = 1;
++ }
++
++ irq = pin_2_irq(idx, apic, pin);
++ /*
++ * skip adding the timer int on secondary nodes, which causes
++ * a small but painful rift in the time-space continuum
++ */
++ if (multi_timer_check(apic, irq))
++ continue;
++ else
++ add_pin_to_irq(irq, apic, pin);
++
++ if (/*!apic &&*/ !IO_APIC_IRQ(irq))
++ continue;
++
++ if (IO_APIC_IRQ(irq)) {
++ vector = assign_irq_vector(irq);
++ entry.vector = vector;
++ ioapic_register_intr(irq, vector, IOAPIC_AUTO);
++
++ if (!apic && (irq < 16))
++ disable_8259A_irq(irq);
++ }
++ spin_lock_irqsave(&ioapic_lock, flags);
++ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
++ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
++ set_native_irq_info(irq, TARGET_CPUS);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++ }
++ }
++
++ if (!first_notcon)
++ apic_printk(APIC_VERBOSE, " not connected.\n");
++}
++
++/*
++ * Set up the 8259A-master output pin:
++ */
++#ifndef CONFIG_XEN
++static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
++{
++ struct IO_APIC_route_entry entry;
++ unsigned long flags;
++
++ memset(&entry,0,sizeof(entry));
++
++ disable_8259A_irq(0);
++
++ /* mask LVT0 */
++ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
++
++ /*
++ * We use logical delivery to get the timer IRQ
++ * to the first CPU.
++ */
++ entry.dest_mode = INT_DEST_MODE;
++ entry.mask = 0; /* unmask IRQ now */
++ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
++ entry.delivery_mode = INT_DELIVERY_MODE;
++ entry.polarity = 0;
++ entry.trigger = 0;
++ entry.vector = vector;
++
++ /*
++ * The timer IRQ doesn't have to know that behind the
++ * scene we have a 8259A-master in AEOI mode ...
++ */
++ irq_desc[0].chip = &ioapic_edge_type;
++
++ /*
++ * Add it to the IO-APIC irq-routing table:
++ */
++ spin_lock_irqsave(&ioapic_lock, flags);
++ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
++ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ enable_8259A_irq(0);
++}
++
++static inline void UNEXPECTED_IO_APIC(void)
++{
++}
++
++void __init print_IO_APIC(void)
++{
++ int apic, i;
++ union IO_APIC_reg_00 reg_00;
++ union IO_APIC_reg_01 reg_01;
++ union IO_APIC_reg_02 reg_02;
++ union IO_APIC_reg_03 reg_03;
++ unsigned long flags;
++
++ if (apic_verbosity == APIC_QUIET)
++ return;
++
++ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
++ for (i = 0; i < nr_ioapics; i++)
++ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
++ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
++
++ /*
++ * We are a bit conservative about what we expect. We have to
++ * know about every hardware change ASAP.
++ */
++ printk(KERN_INFO "testing the IO APIC.......................\n");
++
++ for (apic = 0; apic < nr_ioapics; apic++) {
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ reg_00.raw = io_apic_read(apic, 0);
++ reg_01.raw = io_apic_read(apic, 1);
++ if (reg_01.bits.version >= 0x10)
++ reg_02.raw = io_apic_read(apic, 2);
++ if (reg_01.bits.version >= 0x20)
++ reg_03.raw = io_apic_read(apic, 3);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
++ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
++ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
++ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
++ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
++ if (reg_00.bits.ID >= get_physical_broadcast())
++ UNEXPECTED_IO_APIC();
++ if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
++ UNEXPECTED_IO_APIC();
++
++ printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
++ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
++ if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
++ (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
++ (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
++ (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
++ (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
++ (reg_01.bits.entries != 0x2E) &&
++ (reg_01.bits.entries != 0x3F)
++ )
++ UNEXPECTED_IO_APIC();
++
++ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
++ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
++ if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
++ (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
++ (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
++ (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
++ (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
++ )
++ UNEXPECTED_IO_APIC();
++ if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
++ UNEXPECTED_IO_APIC();
++
++ /*
++ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
++ * but the value of reg_02 is read as the previous read register
++ * value, so ignore it if reg_02 == reg_01.
++ */
++ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
++ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
++ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
++ if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
++ UNEXPECTED_IO_APIC();
++ }
++
++ /*
++ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
++ * or reg_03, but the value of reg_0[23] is read as the previous read
++ * register value, so ignore it if reg_03 == reg_0[12].
++ */
++ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
++ reg_03.raw != reg_01.raw) {
++ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
++ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
++ if (reg_03.bits.__reserved_1)
++ UNEXPECTED_IO_APIC();
++ }
++
++ printk(KERN_DEBUG ".... IRQ redirection table:\n");
++
++ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
++ " Stat Dest Deli Vect: \n");
++
++ for (i = 0; i <= reg_01.bits.entries; i++) {
++ struct IO_APIC_route_entry entry;
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
++ *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ printk(KERN_DEBUG " %02x %03X %02X ",
++ i,
++ entry.dest.logical.logical_dest,
++ entry.dest.physical.physical_dest
++ );
++
++ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
++ entry.mask,
++ entry.trigger,
++ entry.irr,
++ entry.polarity,
++ entry.delivery_status,
++ entry.dest_mode,
++ entry.delivery_mode,
++ entry.vector
++ );
++ }
++ }
++ if (use_pci_vector())
++ printk(KERN_INFO "Using vector-based indexing\n");
++ printk(KERN_DEBUG "IRQ to pin mappings:\n");
++ for (i = 0; i < NR_IRQS; i++) {
++ struct irq_pin_list *entry = irq_2_pin + i;
++ if (entry->pin < 0)
++ continue;
++ if (use_pci_vector() && !platform_legacy_irq(i))
++ printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
++ else
++ printk(KERN_DEBUG "IRQ%d ", i);
++ for (;;) {
++ printk("-> %d:%d", entry->apic, entry->pin);
++ if (!entry->next)
++ break;
++ entry = irq_2_pin + entry->next;
++ }
++ printk("\n");
++ }
++
++ printk(KERN_INFO ".................................... done.\n");
++
++ return;
++}
++
++#if 0
++
++static void print_APIC_bitfield (int base)
++{
++ unsigned int v;
++ int i, j;
++
++ if (apic_verbosity == APIC_QUIET)
++ return;
++
++ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
++ for (i = 0; i < 8; i++) {
++ v = apic_read(base + i*0x10);
++ for (j = 0; j < 32; j++) {
++ if (v & (1<<j))
++ printk("1");
++ else
++ printk("0");
++ }
++ printk("\n");
++ }
++}
++
++void /*__init*/ print_local_APIC(void * dummy)
++{
++ unsigned int v, ver, maxlvt;
++
++ if (apic_verbosity == APIC_QUIET)
++ return;
++
++ printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
++ smp_processor_id(), hard_smp_processor_id());
++ v = apic_read(APIC_ID);
++ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
++ v = apic_read(APIC_LVR);
++ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
++ ver = GET_APIC_VERSION(v);
++ maxlvt = get_maxlvt();
++
++ v = apic_read(APIC_TASKPRI);
++ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
++
++ if (APIC_INTEGRATED(ver)) { /* !82489DX */
++ v = apic_read(APIC_ARBPRI);
++ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
++ v & APIC_ARBPRI_MASK);
++ v = apic_read(APIC_PROCPRI);
++ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
++ }
++
++ v = apic_read(APIC_EOI);
++ printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
++ v = apic_read(APIC_RRR);
++ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
++ v = apic_read(APIC_LDR);
++ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
++ v = apic_read(APIC_DFR);
++ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
++ v = apic_read(APIC_SPIV);
++ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
++
++ printk(KERN_DEBUG "... APIC ISR field:\n");
++ print_APIC_bitfield(APIC_ISR);
++ printk(KERN_DEBUG "... APIC TMR field:\n");
++ print_APIC_bitfield(APIC_TMR);
++ printk(KERN_DEBUG "... APIC IRR field:\n");
++ print_APIC_bitfield(APIC_IRR);
++
++ if (APIC_INTEGRATED(ver)) { /* !82489DX */
++ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
++ apic_write(APIC_ESR, 0);
++ v = apic_read(APIC_ESR);
++ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
++ }
++
++ v = apic_read(APIC_ICR);
++ printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
++ v = apic_read(APIC_ICR2);
++ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
++
++ v = apic_read(APIC_LVTT);
++ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
++
++ if (maxlvt > 3) { /* PC is LVT#4. */
++ v = apic_read(APIC_LVTPC);
++ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
++ }
++ v = apic_read(APIC_LVT0);
++ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
++ v = apic_read(APIC_LVT1);
++ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
++
++ if (maxlvt > 2) { /* ERR is LVT#3. */
++ v = apic_read(APIC_LVTERR);
++ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
++ }
++
++ v = apic_read(APIC_TMICT);
++ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
++ v = apic_read(APIC_TMCCT);
++ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
++ v = apic_read(APIC_TDCR);
++ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
++ printk("\n");
++}
++
++void print_all_local_APICs (void)
++{
++ on_each_cpu(print_local_APIC, NULL, 1, 1);
++}
++
++void /*__init*/ print_PIC(void)
++{
++ unsigned int v;
++ unsigned long flags;
++
++ if (apic_verbosity == APIC_QUIET)
++ return;
++
++ printk(KERN_DEBUG "\nprinting PIC contents\n");
++
++ spin_lock_irqsave(&i8259A_lock, flags);
++
++ v = inb(0xa1) << 8 | inb(0x21);
++ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
++
++ v = inb(0xa0) << 8 | inb(0x20);
++ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
++
++ outb(0x0b,0xa0);
++ outb(0x0b,0x20);
++ v = inb(0xa0) << 8 | inb(0x20);
++ outb(0x0a,0xa0);
++ outb(0x0a,0x20);
++
++ spin_unlock_irqrestore(&i8259A_lock, flags);
++
++ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
++
++ v = inb(0x4d1) << 8 | inb(0x4d0);
++ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
++}
++
++#endif /* 0 */
++
++#else
++void __init print_IO_APIC(void) { }
++#endif /* !CONFIG_XEN */
++
++static void __init enable_IO_APIC(void)
++{
++ union IO_APIC_reg_01 reg_01;
++ int i8259_apic, i8259_pin;
++ int i, apic;
++ unsigned long flags;
++
++ for (i = 0; i < PIN_MAP_SIZE; i++) {
++ irq_2_pin[i].pin = -1;
++ irq_2_pin[i].next = 0;
++ }
++ if (!pirqs_enabled)
++ for (i = 0; i < MAX_PIRQS; i++)
++ pirq_entries[i] = -1;
++
++ /*
++ * The number of IO-APIC IRQ registers (== #pins):
++ */
++ for (apic = 0; apic < nr_ioapics; apic++) {
++ spin_lock_irqsave(&ioapic_lock, flags);
++ reg_01.raw = io_apic_read(apic, 1);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++ nr_ioapic_registers[apic] = reg_01.bits.entries+1;
++ }
++ for(apic = 0; apic < nr_ioapics; apic++) {
++ int pin;
++ /* See if any of the pins is in ExtINT mode */
++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
++ struct IO_APIC_route_entry entry;
++ spin_lock_irqsave(&ioapic_lock, flags);
++ *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
++ *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++
++ /* If the interrupt line is enabled and in ExtInt mode
++ * I have found the pin where the i8259 is connected.
++ */
++ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
++ ioapic_i8259.apic = apic;
++ ioapic_i8259.pin = pin;
++ goto found_i8259;
++ }
++ }
++ }
++ found_i8259:
++	/* Look to see if the MP table has reported the ExtINT */
++	/* If we could not find the appropriate pin by looking at the ioapic,
++	 * the i8259 probably is not connected to the ioapic, but give the
++	 * mptable a chance anyway.
++	 */
++ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
++ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
++	/* Trust the MP table if nothing is set up in the hardware */
++ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
++ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
++ ioapic_i8259.pin = i8259_pin;
++ ioapic_i8259.apic = i8259_apic;
++ }
++ /* Complain if the MP table and the hardware disagree */
++ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
++ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
++ {
++ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
++ }
++
++ /*
++ * Do not trust the IO-APIC being empty at bootup
++ */
++ clear_IO_APIC();
++}
++
++/*
++ * Not an __init, needed by the reboot code
++ */
++void disable_IO_APIC(void)
++{
++ /*
++ * Clear the IO-APIC before rebooting:
++ */
++ clear_IO_APIC();
++
++#ifndef CONFIG_XEN
++ /*
++	 * If the i8259 is routed through an IOAPIC,
++	 * put that IOAPIC in virtual wire mode
++ * so legacy interrupts can be delivered.
++ */
++ if (ioapic_i8259.pin != -1) {
++ struct IO_APIC_route_entry entry;
++ unsigned long flags;
++
++ memset(&entry, 0, sizeof(entry));
++ entry.mask = 0; /* Enabled */
++ entry.trigger = 0; /* Edge */
++ entry.irr = 0;
++ entry.polarity = 0; /* High */
++ entry.delivery_status = 0;
++ entry.dest_mode = 0; /* Physical */
++ entry.delivery_mode = dest_ExtINT; /* ExtInt */
++ entry.vector = 0;
++ entry.dest.physical.physical_dest =
++ GET_APIC_ID(apic_read(APIC_ID));
++
++ /*
++ * Add it to the IO-APIC irq-routing table:
++ */
++ spin_lock_irqsave(&ioapic_lock, flags);
++ io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
++ *(((int *)&entry)+1));
++ io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
++ *(((int *)&entry)+0));
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++ }
++ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
++#endif
++}
++
++/*
++ * function to set the IO-APIC physical IDs based on the
++ * values stored in the MPC table.
++ *
++ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
++ */
++
++#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
++static void __init setup_ioapic_ids_from_mpc(void)
++{
++ union IO_APIC_reg_00 reg_00;
++ physid_mask_t phys_id_present_map;
++ int apic;
++ int i;
++ unsigned char old_id;
++ unsigned long flags;
++
++ /*
++ * Don't check I/O APIC IDs for xAPIC systems. They have
++ * no meaning without the serial APIC bus.
++ */
++ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
++ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
++ return;
++ /*
++ * This is broken; anything with a real cpu count has to
++ * circumvent this idiocy regardless.
++ */
++ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
++
++ /*
++ * Set the IOAPIC ID to the value stored in the MPC table.
++ */
++ for (apic = 0; apic < nr_ioapics; apic++) {
++
++ /* Read the register 0 value */
++ spin_lock_irqsave(&ioapic_lock, flags);
++ reg_00.raw = io_apic_read(apic, 0);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ old_id = mp_ioapics[apic].mpc_apicid;
++
++ if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
++ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
++ apic, mp_ioapics[apic].mpc_apicid);
++ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
++ reg_00.bits.ID);
++ mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
++ }
++
++ /*
++ * Sanity check, is the ID really free? Every APIC in a
++ * system must have a unique ID or we get lots of nice
++ * 'stuck on smp_invalidate_needed IPI wait' messages.
++ */
++ if (check_apicid_used(phys_id_present_map,
++ mp_ioapics[apic].mpc_apicid)) {
++ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
++ apic, mp_ioapics[apic].mpc_apicid);
++ for (i = 0; i < get_physical_broadcast(); i++)
++ if (!physid_isset(i, phys_id_present_map))
++ break;
++ if (i >= get_physical_broadcast())
++ panic("Max APIC ID exceeded!\n");
++ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
++ i);
++ physid_set(i, phys_id_present_map);
++ mp_ioapics[apic].mpc_apicid = i;
++ } else {
++ physid_mask_t tmp;
++ tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
++ apic_printk(APIC_VERBOSE, "Setting %d in the "
++ "phys_id_present_map\n",
++ mp_ioapics[apic].mpc_apicid);
++ physids_or(phys_id_present_map, phys_id_present_map, tmp);
++ }
++
++
++ /*
++ * We need to adjust the IRQ routing table
++ * if the ID changed.
++ */
++ if (old_id != mp_ioapics[apic].mpc_apicid)
++ for (i = 0; i < mp_irq_entries; i++)
++ if (mp_irqs[i].mpc_dstapic == old_id)
++ mp_irqs[i].mpc_dstapic
++ = mp_ioapics[apic].mpc_apicid;
++
++ /*
++ * Read the right value from the MPC table and
++ * write it into the ID register.
++ */
++ apic_printk(APIC_VERBOSE, KERN_INFO
++ "...changing IO-APIC physical APIC ID to %d ...",
++ mp_ioapics[apic].mpc_apicid);
++
++ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
++ spin_lock_irqsave(&ioapic_lock, flags);
++ io_apic_write(apic, 0, reg_00.raw);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ /*
++ * Sanity check
++ */
++ spin_lock_irqsave(&ioapic_lock, flags);
++ reg_00.raw = io_apic_read(apic, 0);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
++ printk("could not set ID!\n");
++ else
++ apic_printk(APIC_VERBOSE, " ok.\n");
++ }
++}
++#else
++static void __init setup_ioapic_ids_from_mpc(void) { }
++#endif
++
++#ifndef CONFIG_XEN
++/*
++ * There is a nasty bug in some older SMP boards: their mptable lies
++ * about the timer IRQ. We do the following to work around the situation:
++ *
++ * - timer IRQ defaults to IO-APIC IRQ
++ * - if this function detects that timer IRQs are defunct, then we fall
++ * back to ISA timer IRQs
++ */
++static int __init timer_irq_works(void)
++{
++ unsigned long t1 = jiffies;
++
++ local_irq_enable();
++ /* Let ten ticks pass... */
++ mdelay((10 * 1000) / HZ);
++
++ /*
++ * Expect a few ticks at least, to be sure some possible
++	 * glue logic does not lock up after the first one or
++	 * two ticks in a non-ExtINT mode. Also the local APIC
++ * might have cached one ExtINT interrupt. Finally, at
++ * least one tick may be lost due to delays.
++ */
++ if (jiffies - t1 > 4)
++ return 1;
++
++ return 0;
++}
++
++/*
++ * In the SMP+IOAPIC case it might happen that there are an unspecified
++ * number of pending IRQ events left unhandled. These cases are very
++ * rare, so we 'resend' these IRQs via IPIs, to the same CPU. It's much
++ * better to do it this way because we then do not have to be aware of
++ * 'pending' interrupts in the IRQ path, except at this point.
++ */
++/*
++ * Edge-triggered interrupts need to resend any interrupt
++ * that was delayed, but this is now handled in the
++ * device-independent code.
++ */
++
++/*
++ * Starting up an edge-triggered IO-APIC interrupt is
++ * nasty - we need to make sure that we get the edge.
++ * If it is already asserted for some reason, we need to
++ * return 1 to indicate that it was pending.
++ *
++ * This is not complete - we should be able to fake
++ * an edge even if it isn't on the 8259A...
++ */
++static unsigned int startup_edge_ioapic_irq(unsigned int irq)
++{
++ int was_pending = 0;
++ unsigned long flags;
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ if (irq < 16) {
++ disable_8259A_irq(irq);
++ if (i8259A_irq_pending(irq))
++ was_pending = 1;
++ }
++ __unmask_IO_APIC_irq(irq);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ return was_pending;
++}
++
++/*
++ * Once we have recorded IRQ_PENDING, we can mask the
++ * interrupt for real. This prevents IRQ storms from unhandled
++ * devices.
++ */
++static void ack_edge_ioapic_irq(unsigned int irq)
++{
++ move_irq(irq);
++ if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
++ == (IRQ_PENDING | IRQ_DISABLED))
++ mask_IO_APIC_irq(irq);
++ ack_APIC_irq();
++}
++
++/*
++ * Level triggered interrupts can just be masked,
++ * and shutting down and starting up the interrupt
++ * is the same as enabling and disabling them -- except
++ * that startup needs to return a "was pending" value.
++ *
++ * Level triggered interrupts are special because we
++ * do not touch any IO-APIC register while handling
++ * them. We ack the APIC in the end-IRQ handler, not
++ * in the start-IRQ-handler. Protection against reentrance
++ * from the same interrupt is still provided, both by the
++ * generic IRQ layer and by the fact that an unacked local
++ * APIC does not accept IRQs.
++ */
++static unsigned int startup_level_ioapic_irq (unsigned int irq)
++{
++ unmask_IO_APIC_irq(irq);
++
++ return 0; /* don't check for pending */
++}
++
++static void end_level_ioapic_irq (unsigned int irq)
++{
++ unsigned long v;
++ int i;
++
++ move_irq(irq);
++/*
++ * It appears there is an erratum which affects at least version 0x11
++ * of I/O APIC (that's the 82093AA and cores integrated into various
++ * chipsets). Under certain conditions a level-triggered interrupt is
++ * erroneously delivered as an edge-triggered one but the respective IRR
++ * bit gets set nevertheless. As a result the I/O unit expects an EOI
++ * message but it will never arrive and further interrupts are blocked
++ * from the source. The exact reason is so far unknown, but the
++ * phenomenon was observed when two consecutive interrupt requests
++ * from a given source get delivered to the same CPU and the source is
++ * temporarily disabled in between.
++ *
++ * A workaround is to simulate an EOI message manually. We achieve it
++ * by setting the trigger mode to edge and then to level when the edge
++ * trigger mode gets detected in the TMR of a local APIC for a
++ * level-triggered interrupt. We mask the source for the time of the
++ * operation to prevent an edge-triggered interrupt escaping meanwhile.
++ * The idea is from Manfred Spraul. --macro
++ */
++ i = IO_APIC_VECTOR(irq);
++
++ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
++
++ ack_APIC_irq();
++
++ if (!(v & (1 << (i & 0x1f)))) {
++ atomic_inc(&irq_mis_count);
++ spin_lock(&ioapic_lock);
++ __mask_and_edge_IO_APIC_irq(irq);
++ __unmask_and_level_IO_APIC_irq(irq);
++ spin_unlock(&ioapic_lock);
++ }
++}
++
++#ifdef CONFIG_PCI_MSI
++static unsigned int startup_edge_ioapic_vector(unsigned int vector)
++{
++ int irq = vector_to_irq(vector);
++
++ return startup_edge_ioapic_irq(irq);
++}
++
++static void ack_edge_ioapic_vector(unsigned int vector)
++{
++ int irq = vector_to_irq(vector);
++
++ move_native_irq(vector);
++ ack_edge_ioapic_irq(irq);
++}
++
++static unsigned int startup_level_ioapic_vector (unsigned int vector)
++{
++ int irq = vector_to_irq(vector);
++
++ return startup_level_ioapic_irq (irq);
++}
++
++static void end_level_ioapic_vector (unsigned int vector)
++{
++ int irq = vector_to_irq(vector);
++
++ move_native_irq(vector);
++ end_level_ioapic_irq(irq);
++}
++
++static void mask_IO_APIC_vector (unsigned int vector)
++{
++ int irq = vector_to_irq(vector);
++
++ mask_IO_APIC_irq(irq);
++}
++
++static void unmask_IO_APIC_vector (unsigned int vector)
++{
++ int irq = vector_to_irq(vector);
++
++ unmask_IO_APIC_irq(irq);
++}
++
++#ifdef CONFIG_SMP
++static void set_ioapic_affinity_vector (unsigned int vector,
++ cpumask_t cpu_mask)
++{
++ int irq = vector_to_irq(vector);
++
++ set_native_irq_info(vector, cpu_mask);
++ set_ioapic_affinity_irq(irq, cpu_mask);
++}
++#endif
++#endif
++
++static int ioapic_retrigger(unsigned int irq)
++{
++ send_IPI_self(IO_APIC_VECTOR(irq));
++
++ return 1;
++}
++
++/*
++ * Level and edge triggered IO-APIC interrupts need different handling,
++ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
++ * handled with the level-triggered descriptor, but that one has slightly
++ * more overhead. Level-triggered interrupts cannot be handled with the
++ * edge-triggered handler, without risking IRQ storms and other ugly
++ * races.
++ */
++static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
++ .typename = "IO-APIC-edge",
++ .startup = startup_edge_ioapic,
++ .shutdown = shutdown_edge_ioapic,
++ .enable = enable_edge_ioapic,
++ .disable = disable_edge_ioapic,
++ .ack = ack_edge_ioapic,
++ .end = end_edge_ioapic,
++#ifdef CONFIG_SMP
++ .set_affinity = set_ioapic_affinity,
++#endif
++ .retrigger = ioapic_retrigger,
++};
++
++static struct hw_interrupt_type ioapic_level_type __read_mostly = {
++ .typename = "IO-APIC-level",
++ .startup = startup_level_ioapic,
++ .shutdown = shutdown_level_ioapic,
++ .enable = enable_level_ioapic,
++ .disable = disable_level_ioapic,
++ .ack = mask_and_ack_level_ioapic,
++ .end = end_level_ioapic,
++#ifdef CONFIG_SMP
++ .set_affinity = set_ioapic_affinity,
++#endif
++ .retrigger = ioapic_retrigger,
++};
++#endif /* !CONFIG_XEN */
++
++static inline void init_IO_APIC_traps(void)
++{
++ int irq;
++
++ /*
++ * NOTE! The local APIC isn't very good at handling
++ * multiple interrupts at the same interrupt level.
++ * As the interrupt level is determined by taking the
++ * vector number and shifting that right by 4, we
++ * want to spread these out a bit so that they don't
++ * all fall in the same interrupt level.
++ *
++ * Also, we've got to be careful not to trash gate
++ * 0x80, because int 0x80 is hm, kind of importantish. ;)
++ */
++ for (irq = 0; irq < NR_IRQS ; irq++) {
++ int tmp = irq;
++ if (use_pci_vector()) {
++ if (!platform_legacy_irq(tmp))
++ if ((tmp = vector_to_irq(tmp)) == -1)
++ continue;
++ }
++ if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
++ /*
++ * Hmm.. We don't have an entry for this,
++ * so default to an old-fashioned 8259
++ * interrupt if we can..
++ */
++ if (irq < 16)
++ make_8259A_irq(irq);
++#ifndef CONFIG_XEN
++ else
++ /* Strange. Oh, well.. */
++ irq_desc[irq].chip = &no_irq_type;
++#endif
++ }
++ }
++}
++
++#ifndef CONFIG_XEN
++static void enable_lapic_irq (unsigned int irq)
++{
++ unsigned long v;
++
++ v = apic_read(APIC_LVT0);
++ apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
++}
++
++static void disable_lapic_irq (unsigned int irq)
++{
++ unsigned long v;
++
++ v = apic_read(APIC_LVT0);
++ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
++}
++
++static void ack_lapic_irq (unsigned int irq)
++{
++ ack_APIC_irq();
++}
++
++static void end_lapic_irq (unsigned int i) { /* nothing */ }
++
++static struct hw_interrupt_type lapic_irq_type __read_mostly = {
++ .typename = "local-APIC-edge",
++ .startup = NULL, /* startup_irq() not used for IRQ0 */
++ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
++ .enable = enable_lapic_irq,
++ .disable = disable_lapic_irq,
++ .ack = ack_lapic_irq,
++ .end = end_lapic_irq
++};
++
++static void setup_nmi (void)
++{
++ /*
++ * Dirty trick to enable the NMI watchdog ...
++ * We put the 8259A master into AEOI mode and
++	 * unmask LVT0 as NMI on all local APICs.
++ *
++ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
++ * is from Maciej W. Rozycki - so we do not have to EOI from
++ * the NMI handler or the timer interrupt.
++ */
++ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
++
++ on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
++
++ apic_printk(APIC_VERBOSE, " done.\n");
++}
++
++/*
++ * This looks a bit hackish but it's about the only way of sending
++ * a few INTA cycles to 8259As and any associated glue logic. ICR does
++ * not support the ExtINT mode, unfortunately. We need to send these
++ * cycles as some i82489DX-based boards have glue logic that keeps the
++ * 8259A interrupt line asserted until INTA. --macro
++ */
++static inline void unlock_ExtINT_logic(void)
++{
++ int apic, pin, i;
++ struct IO_APIC_route_entry entry0, entry1;
++ unsigned char save_control, save_freq_select;
++ unsigned long flags;
++
++ pin = find_isa_irq_pin(8, mp_INT);
++ apic = find_isa_irq_apic(8, mp_INT);
++ if (pin == -1)
++ return;
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
++ *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++ clear_IO_APIC_pin(apic, pin);
++
++ memset(&entry1, 0, sizeof(entry1));
++
++ entry1.dest_mode = 0; /* physical delivery */
++ entry1.mask = 0; /* unmask IRQ now */
++ entry1.dest.physical.physical_dest = hard_smp_processor_id();
++ entry1.delivery_mode = dest_ExtINT;
++ entry1.polarity = entry0.polarity;
++ entry1.trigger = 0;
++ entry1.vector = 0;
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
++ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ save_control = CMOS_READ(RTC_CONTROL);
++ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
++ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
++ RTC_FREQ_SELECT);
++ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
++
++ i = 100;
++ while (i-- > 0) {
++ mdelay(10);
++ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
++ i -= 10;
++ }
++
++ CMOS_WRITE(save_control, RTC_CONTROL);
++ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
++ clear_IO_APIC_pin(apic, pin);
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
++ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++}
++
++int timer_uses_ioapic_pin_0;
++
++/*
++ * This code may look a bit paranoid, but it's supposed to cooperate with
++ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
++ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
++ * fanatically on his truly buggy board.
++ */
++static inline void check_timer(void)
++{
++ int apic1, pin1, apic2, pin2;
++ int vector;
++
++ /*
++ * get/set the timer IRQ vector:
++ */
++ disable_8259A_irq(0);
++ vector = assign_irq_vector(0);
++ set_intr_gate(vector, interrupt[0]);
++
++ /*
++ * Subtle, code in do_timer_interrupt() expects an AEOI
++ * mode for the 8259A whenever interrupts are routed
++ * through I/O APICs. Also IRQ0 has to be enabled in
++ * the 8259A which implies the virtual wire has to be
++ * disabled in the local APIC.
++ */
++ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
++ init_8259A(1);
++ timer_ack = 1;
++ if (timer_over_8254 > 0)
++ enable_8259A_irq(0);
++
++ pin1 = find_isa_irq_pin(0, mp_INT);
++ apic1 = find_isa_irq_apic(0, mp_INT);
++ pin2 = ioapic_i8259.pin;
++ apic2 = ioapic_i8259.apic;
++
++ if (pin1 == 0)
++ timer_uses_ioapic_pin_0 = 1;
++
++ printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
++ vector, apic1, pin1, apic2, pin2);
++
++ if (pin1 != -1) {
++ /*
++ * Ok, does IRQ0 through the IOAPIC work?
++ */
++ unmask_IO_APIC_irq(0);
++ if (timer_irq_works()) {
++ if (nmi_watchdog == NMI_IO_APIC) {
++ disable_8259A_irq(0);
++ setup_nmi();
++ enable_8259A_irq(0);
++ }
++ if (disable_timer_pin_1 > 0)
++ clear_IO_APIC_pin(0, pin1);
++ return;
++ }
++ clear_IO_APIC_pin(apic1, pin1);
++ printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
++ "IO-APIC\n");
++ }
++
++ printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
++ if (pin2 != -1) {
++ printk("\n..... (found pin %d) ...", pin2);
++ /*
++ * legacy devices should be connected to IO APIC #0
++ */
++ setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
++ if (timer_irq_works()) {
++ printk("works.\n");
++ if (pin1 != -1)
++ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
++ else
++ add_pin_to_irq(0, apic2, pin2);
++ if (nmi_watchdog == NMI_IO_APIC) {
++ setup_nmi();
++ }
++ return;
++ }
++ /*
++ * Cleanup, just in case ...
++ */
++ clear_IO_APIC_pin(apic2, pin2);
++ }
++ printk(" failed.\n");
++
++ if (nmi_watchdog == NMI_IO_APIC) {
++ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
++ nmi_watchdog = 0;
++ }
++
++ printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
++
++ disable_8259A_irq(0);
++ irq_desc[0].chip = &lapic_irq_type;
++ apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
++ enable_8259A_irq(0);
++
++ if (timer_irq_works()) {
++ printk(" works.\n");
++ return;
++ }
++ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
++ printk(" failed.\n");
++
++ printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
++
++ timer_ack = 0;
++ init_8259A(0);
++ make_8259A_irq(0);
++ apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
++
++ unlock_ExtINT_logic();
++
++ if (timer_irq_works()) {
++ printk(" works.\n");
++ return;
++ }
++ printk(" failed :(.\n");
++ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
++ "report. Then try booting with the 'noapic' option");
++}
++#else
++int timer_uses_ioapic_pin_0 = 0;
++#define check_timer() ((void)0)
++#endif
++
++/*
++ *
++ * IRQs that are handled by the PIC in the MPS IOAPIC case.
++ * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
++ * Linux doesn't really care, as it's not actually used
++ * for any interrupt handling anyway.
++ */
++#define PIC_IRQS (1 << PIC_CASCADE_IR)
++
++void __init setup_IO_APIC(void)
++{
++ enable_IO_APIC();
++
++ if (acpi_ioapic)
++ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
++ else
++ io_apic_irqs = ~PIC_IRQS;
++
++ printk("ENABLING IO-APIC IRQs\n");
++
++ /*
++ * Set up IO-APIC IRQ routing.
++ */
++ if (!acpi_ioapic)
++ setup_ioapic_ids_from_mpc();
++#ifndef CONFIG_XEN
++ sync_Arb_IDs();
++#endif
++ setup_IO_APIC_irqs();
++ init_IO_APIC_traps();
++ check_timer();
++ if (!acpi_ioapic)
++ print_IO_APIC();
++}
++
++static int __init setup_disable_8254_timer(char *s)
++{
++ timer_over_8254 = -1;
++ return 1;
++}
++static int __init setup_enable_8254_timer(char *s)
++{
++ timer_over_8254 = 2;
++ return 1;
++}
++
++__setup("disable_8254_timer", setup_disable_8254_timer);
++__setup("enable_8254_timer", setup_enable_8254_timer);
++
++/*
++ * Called after all the initialization is done. If we didn't find any
++ * APIC bugs then we can allow the modify fast path.
++ */
++
++static int __init io_apic_bug_finalize(void)
++{
++ if(sis_apic_bug == -1)
++ sis_apic_bug = 0;
++ if (is_initial_xendomain()) {
++ struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
++ op.u.platform_quirk.quirk_id = sis_apic_bug ?
++ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
++ HYPERVISOR_platform_op(&op);
++ }
++ return 0;
++}
++
++late_initcall(io_apic_bug_finalize);
++
++struct sysfs_ioapic_data {
++ struct sys_device dev;
++ struct IO_APIC_route_entry entry[0];
++};
++static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
++
++static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
++{
++ struct IO_APIC_route_entry *entry;
++ struct sysfs_ioapic_data *data;
++ unsigned long flags;
++ int i;
++
++ data = container_of(dev, struct sysfs_ioapic_data, dev);
++ entry = data->entry;
++ spin_lock_irqsave(&ioapic_lock, flags);
++ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
++ *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
++ *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
++ }
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ return 0;
++}
++
++static int ioapic_resume(struct sys_device *dev)
++{
++ struct IO_APIC_route_entry *entry;
++ struct sysfs_ioapic_data *data;
++ unsigned long flags;
++ union IO_APIC_reg_00 reg_00;
++ int i;
++
++ data = container_of(dev, struct sysfs_ioapic_data, dev);
++ entry = data->entry;
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ reg_00.raw = io_apic_read(dev->id, 0);
++ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
++ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
++ io_apic_write(dev->id, 0, reg_00.raw);
++ }
++ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
++ io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
++ io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
++ }
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ return 0;
++}
++
++static struct sysdev_class ioapic_sysdev_class = {
++ set_kset_name("ioapic"),
++ .suspend = ioapic_suspend,
++ .resume = ioapic_resume,
++};
++
++static int __init ioapic_init_sysfs(void)
++{
++ struct sys_device * dev;
++ int i, size, error = 0;
++
++ error = sysdev_class_register(&ioapic_sysdev_class);
++ if (error)
++ return error;
++
++ for (i = 0; i < nr_ioapics; i++ ) {
++ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
++ * sizeof(struct IO_APIC_route_entry);
++ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
++ if (!mp_ioapic_data[i]) {
++ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
++ continue;
++ }
++ memset(mp_ioapic_data[i], 0, size);
++ dev = &mp_ioapic_data[i]->dev;
++ dev->id = i;
++ dev->cls = &ioapic_sysdev_class;
++ error = sysdev_register(dev);
++ if (error) {
++ kfree(mp_ioapic_data[i]);
++ mp_ioapic_data[i] = NULL;
++ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
++ continue;
++ }
++ }
++
++ return 0;
++}
++
++device_initcall(ioapic_init_sysfs);
++
++/* --------------------------------------------------------------------------
++ ACPI-based IOAPIC Configuration
++ -------------------------------------------------------------------------- */
++
++#ifdef CONFIG_ACPI
++
++int __init io_apic_get_unique_id (int ioapic, int apic_id)
++{
++#ifndef CONFIG_XEN
++ union IO_APIC_reg_00 reg_00;
++ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
++ physid_mask_t tmp;
++ unsigned long flags;
++ int i = 0;
++
++ /*
++ * The P4 platform supports up to 256 APIC IDs on two separate APIC
++ * buses (one for LAPICs, one for IOAPICs), where predecessors only
++	 * support up to 16 on one shared APIC bus.
++ *
++ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
++ * advantage of new APIC bus architecture.
++ */
++
++ if (physids_empty(apic_id_map))
++ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ reg_00.raw = io_apic_read(ioapic, 0);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ if (apic_id >= get_physical_broadcast()) {
++ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
++ "%d\n", ioapic, apic_id, reg_00.bits.ID);
++ apic_id = reg_00.bits.ID;
++ }
++
++ /*
++ * Every APIC in a system must have a unique ID or we get lots of nice
++ * 'stuck on smp_invalidate_needed IPI wait' messages.
++ */
++ if (check_apicid_used(apic_id_map, apic_id)) {
++
++ for (i = 0; i < get_physical_broadcast(); i++) {
++ if (!check_apicid_used(apic_id_map, i))
++ break;
++ }
++
++ if (i == get_physical_broadcast())
++ panic("Max apic_id exceeded!\n");
++
++ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
++ "trying %d\n", ioapic, apic_id, i);
++
++ apic_id = i;
++ }
++
++ tmp = apicid_to_cpu_present(apic_id);
++ physids_or(apic_id_map, apic_id_map, tmp);
++
++ if (reg_00.bits.ID != apic_id) {
++ reg_00.bits.ID = apic_id;
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ io_apic_write(ioapic, 0, reg_00.raw);
++ reg_00.raw = io_apic_read(ioapic, 0);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ /* Sanity check */
++ if (reg_00.bits.ID != apic_id) {
++ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
++ return -1;
++ }
++ }
++
++ apic_printk(APIC_VERBOSE, KERN_INFO
++ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
++#endif /* !CONFIG_XEN */
++
++ return apic_id;
++}
++
++
++int __init io_apic_get_version (int ioapic)
++{
++ union IO_APIC_reg_01 reg_01;
++ unsigned long flags;
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ reg_01.raw = io_apic_read(ioapic, 1);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ return reg_01.bits.version;
++}
++
++
++int __init io_apic_get_redir_entries (int ioapic)
++{
++ union IO_APIC_reg_01 reg_01;
++ unsigned long flags;
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ reg_01.raw = io_apic_read(ioapic, 1);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ return reg_01.bits.entries;
++}
++
++
++int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
++{
++ struct IO_APIC_route_entry entry;
++ unsigned long flags;
++
++ if (!IO_APIC_IRQ(irq)) {
++ printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
++ ioapic);
++ return -EINVAL;
++ }
++
++ /*
++ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
++ * Note that we mask (disable) IRQs now -- these get enabled when the
++ * corresponding device driver registers for this IRQ.
++ */
++
++ memset(&entry,0,sizeof(entry));
++
++ entry.delivery_mode = INT_DELIVERY_MODE;
++ entry.dest_mode = INT_DEST_MODE;
++ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
++ entry.trigger = edge_level;
++ entry.polarity = active_high_low;
++ entry.mask = 1;
++
++ /*
++ * IRQs < 16 are already in the irq_2_pin[] map
++ */
++ if (irq >= 16)
++ add_pin_to_irq(irq, ioapic, pin);
++
++ entry.vector = assign_irq_vector(irq);
++
++ apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
++ "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
++ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
++ edge_level, active_high_low);
++
++ ioapic_register_intr(irq, entry.vector, edge_level);
++
++ if (!ioapic && (irq < 16))
++ disable_8259A_irq(irq);
++
++ spin_lock_irqsave(&ioapic_lock, flags);
++ io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
++ io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
++ set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
++ spin_unlock_irqrestore(&ioapic_lock, flags);
++
++ return 0;
++}
++
++#endif /* CONFIG_ACPI */
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/ioport-xen.c 2007-08-27 14:01:58.000000000 -0400
+@@ -0,0 +1,144 @@
++/*
++ * linux/arch/i386/kernel/ioport.c
++ *
++ * This contains the io-permission bitmap code - written by obz, with changes
++ * by Linus.
++ */
++
++#include <linux/sched.h>
++#include <linux/kernel.h>
++#include <linux/capability.h>
++#include <linux/errno.h>
++#include <linux/types.h>
++#include <linux/ioport.h>
++#include <linux/smp.h>
++#include <linux/smp_lock.h>
++#include <linux/stddef.h>
++#include <linux/slab.h>
++#include <linux/thread_info.h>
++#include <xen/interface/physdev.h>
++
++/* Set EXTENT bits starting at BASE in BITMAP to value NEW_VALUE. */
++static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
++{
++ unsigned long mask;
++ unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
++ unsigned int low_index = base & (BITS_PER_LONG-1);
++ int length = low_index + extent;
++
++ if (low_index != 0) {
++ mask = (~0UL << low_index);
++ if (length < BITS_PER_LONG)
++ mask &= ~(~0UL << length);
++ if (new_value)
++ *bitmap_base++ |= mask;
++ else
++ *bitmap_base++ &= ~mask;
++ length -= BITS_PER_LONG;
++ }
++
++ mask = (new_value ? ~0UL : 0UL);
++ while (length >= BITS_PER_LONG) {
++ *bitmap_base++ = mask;
++ length -= BITS_PER_LONG;
++ }
++
++ if (length > 0) {
++ mask = ~(~0UL << length);
++ if (new_value)
++ *bitmap_base++ |= mask;
++ else
++ *bitmap_base++ &= ~mask;
++ }
++}
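++
++/*
++ * Worked example: with BITS_PER_LONG == 32, a call such as
++ * set_bitmap(bitmap, 0x3f8, 8, 1) yields low_index = 24 and
++ * length = 32, so only the first branch runs and it sets bits
++ * 24..31 of bitmap word 31, i.e. ports 0x3f8..0x3ff.
++ */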
++
++
++/*
++ * this changes the io permissions bitmap in the current task.
++ */
++asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
++{
++ struct thread_struct * t = &current->thread;
++ unsigned long *bitmap;
++ struct physdev_set_iobitmap set_iobitmap;
++
++ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
++ return -EINVAL;
++ if (turn_on && !capable(CAP_SYS_RAWIO))
++ return -EPERM;
++
++ /*
++ * If it's the first ioperm() call in this thread's lifetime, set the
++ * IO bitmap up. ioperm() is much less timing critical than clone(),
++	 * which is why we delay this operation until now:
++ */
++ if (!t->io_bitmap_ptr) {
++ bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
++ if (!bitmap)
++ return -ENOMEM;
++
++ memset(bitmap, 0xff, IO_BITMAP_BYTES);
++ t->io_bitmap_ptr = bitmap;
++ set_thread_flag(TIF_IO_BITMAP);
++
++ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
++ set_iobitmap.nr_ports = IO_BITMAP_BITS;
++ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
++ }
++
++ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
++
++ return 0;
++}
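++
++/*
++ * Usage sketch: a process holding CAP_SYS_RAWIO can request the
++ * parallel-port registers with ioperm(0x378, 3, 1).  Note the
++ * inverted sense in the set_bitmap() call above: the bitmap starts
++ * as all ones (all ports denied) and bits are cleared to grant
++ * access, matching the I/O bitmap registered with Xen above.
++ */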
++
++/*
++ * sys_iopl has to be used when you want to access the IO ports
++ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
++ * you'd need 8kB of bitmaps/process, which is a bit excessive.
++ *
++ * Here we just change the eflags value on the stack: we allow
++ * only the super-user to do it. This depends on the stack-layout
++ * on system-call entry - see also fork() and the signal handling
++ * code.
++ */
++
++asmlinkage long sys_iopl(unsigned long unused)
++{
++ volatile struct pt_regs * regs = (struct pt_regs *) &unused;
++ unsigned int level = regs->ebx;
++ struct thread_struct *t = &current->thread;
++ unsigned int old = (t->iopl >> 12) & 3;
++
++ if (level > 3)
++ return -EINVAL;
++ /* Trying to gain more privileges? */
++ if (level > old) {
++ if (!capable(CAP_SYS_RAWIO))
++ return -EPERM;
++ }
++ t->iopl = level << 12;
++ set_iopl_mask(t->iopl);
++ return 0;
++}
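++
++/*
++ * Note: the iopl level is stored pre-shifted by 12 so it lines up
++ * with the IOPL field (bits 12-13) of EFLAGS; under this Xen port,
++ * set_iopl_mask() is expected to hand the level to the hypervisor
++ * rather than editing EFLAGS on the stack directly.
++ */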
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/irq-xen.c 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,331 @@
++/*
++ * linux/arch/i386/kernel/irq.c
++ *
++ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
++ *
++ * This file contains the lowest level x86-specific interrupt
++ * entry, irq-stacks and irq statistics code. All the remaining
++ * irq logic is done by the generic kernel/irq/ code and
++ * by the x86-specific irq controller code. (e.g. i8259.c and
++ * io_apic.c.)
++ */
++
++#include <asm/uaccess.h>
++#include <linux/module.h>
++#include <linux/seq_file.h>
++#include <linux/interrupt.h>
++#include <linux/kernel_stat.h>
++#include <linux/notifier.h>
++#include <linux/cpu.h>
++#include <linux/delay.h>
++
++DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
++EXPORT_PER_CPU_SYMBOL(irq_stat);
++
++#ifndef CONFIG_X86_LOCAL_APIC
++/*
++ * 'what should we do if we get a hw irq event on an illegal vector'.
++ * each architecture has to answer this themselves.
++ */
++void ack_bad_irq(unsigned int irq)
++{
++ printk("unexpected IRQ trap at vector %02x\n", irq);
++}
++#endif
++
++#ifdef CONFIG_4KSTACKS
++/*
++ * per-CPU IRQ handling contexts (thread information and stack)
++ */
++union irq_ctx {
++ struct thread_info tinfo;
++ u32 stack[THREAD_SIZE/sizeof(u32)];
++};
++
++static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
++static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
++#endif
++
++/*
++ * do_IRQ handles all normal device IRQs (the special
++ * SMP cross-CPU interrupts have their own specific
++ * handlers).
++ */
++fastcall unsigned int do_IRQ(struct pt_regs *regs)
++{
++ /* high bit used in ret_from_ code */
++ int irq = ~regs->orig_eax;
++#ifdef CONFIG_4KSTACKS
++ union irq_ctx *curctx, *irqctx;
++ u32 *isp;
++#endif
++
++ if (unlikely((unsigned)irq >= NR_IRQS)) {
++ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
++ __FUNCTION__, irq);
++ BUG();
++ }
++
++ irq_enter();
++#ifdef CONFIG_DEBUG_STACKOVERFLOW
++ /* Debugging check for stack overflow: is there less than 1KB free? */
++ {
++ long esp;
++
++ __asm__ __volatile__("andl %%esp,%0" :
++ "=r" (esp) : "0" (THREAD_SIZE - 1));
++ if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
++ printk("do_IRQ: stack overflow: %ld\n",
++ esp - sizeof(struct thread_info));
++ dump_stack();
++ }
++ }
++#endif
++
++#ifdef CONFIG_4KSTACKS
++
++ curctx = (union irq_ctx *) current_thread_info();
++ irqctx = hardirq_ctx[smp_processor_id()];
++
++ /*
++ * this is where we switch to the IRQ stack. However, if we are
++ * already using the IRQ stack (because we interrupted a hardirq
++ * handler) we can't do that and just have to keep using the
++ * current stack (which is the irq stack already after all)
++ */
++ if (curctx != irqctx) {
++ int arg1, arg2, ebx;
++
++ /* build the stack frame on the IRQ stack */
++ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
++ irqctx->tinfo.task = curctx->tinfo.task;
++ irqctx->tinfo.previous_esp = current_stack_pointer;
++
++ /*
++ * Copy the softirq bits in preempt_count so that the
++ * softirq checks work in the hardirq context.
++ */
++ irqctx->tinfo.preempt_count =
++ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
++ (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
++
++ asm volatile(
++ " xchgl %%ebx,%%esp \n"
++ " call __do_IRQ \n"
++ " movl %%ebx,%%esp \n"
++ : "=a" (arg1), "=d" (arg2), "=b" (ebx)
++ : "0" (irq), "1" (regs), "2" (isp)
++ : "memory", "cc", "ecx"
++ );
++ } else
++#endif
++ __do_IRQ(irq, regs);
++
++ irq_exit();
++
++ return 1;
++}
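++
++/*
++ * Note on the inline asm above: xchgl swaps %esp with the prepared
++ * IRQ-stack pointer so that __do_IRQ() runs on the per-CPU hardirq
++ * stack; %ebx holds the saved stack pointer across the call and the
++ * final movl switches back to the interrupted task's stack.
++ */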
++
++#ifdef CONFIG_4KSTACKS
++
++/*
++ * These should really be __section__(".bss.page_aligned") as well, but
++ * gcc's 3.0 and earlier don't handle that correctly.
++ */
++static char softirq_stack[NR_CPUS * THREAD_SIZE]
++ __attribute__((__aligned__(THREAD_SIZE)));
++
++static char hardirq_stack[NR_CPUS * THREAD_SIZE]
++ __attribute__((__aligned__(THREAD_SIZE)));
++
++/*
++ * allocate per-cpu stacks for hardirq and for softirq processing
++ */
++void irq_ctx_init(int cpu)
++{
++ union irq_ctx *irqctx;
++
++ if (hardirq_ctx[cpu])
++ return;
++
++ irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
++ irqctx->tinfo.task = NULL;
++ irqctx->tinfo.exec_domain = NULL;
++ irqctx->tinfo.cpu = cpu;
++ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
++ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
++
++ hardirq_ctx[cpu] = irqctx;
++
++ irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
++ irqctx->tinfo.task = NULL;
++ irqctx->tinfo.exec_domain = NULL;
++ irqctx->tinfo.cpu = cpu;
++ irqctx->tinfo.preempt_count = 0;
++ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
++
++ softirq_ctx[cpu] = irqctx;
++
++ printk("CPU %u irqstacks, hard=%p soft=%p\n",
++ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
++}
++
++void irq_ctx_exit(int cpu)
++{
++ hardirq_ctx[cpu] = NULL;
++}
++
++extern asmlinkage void __do_softirq(void);
++
++asmlinkage void do_softirq(void)
++{
++ unsigned long flags;
++ struct thread_info *curctx;
++ union irq_ctx *irqctx;
++ u32 *isp;
++
++ if (in_interrupt())
++ return;
++
++ local_irq_save(flags);
++
++ if (local_softirq_pending()) {
++ curctx = current_thread_info();
++ irqctx = softirq_ctx[smp_processor_id()];
++ irqctx->tinfo.task = curctx->task;
++ irqctx->tinfo.previous_esp = current_stack_pointer;
++
++ /* build the stack frame on the softirq stack */
++ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
++
++ asm volatile(
++ " xchgl %%ebx,%%esp \n"
++ " call __do_softirq \n"
++ " movl %%ebx,%%esp \n"
++ : "=b"(isp)
++ : "0"(isp)
++ : "memory", "cc", "edx", "ecx", "eax"
++ );
++ /*
++		 * Shouldn't happen, we returned above if in_interrupt():
++ */
++ WARN_ON_ONCE(softirq_count());
++ }
++
++ local_irq_restore(flags);
++}
++
++EXPORT_SYMBOL(do_softirq);
++#endif
++
++/*
++ * Interrupt statistics:
++ */
++
++atomic_t irq_err_count;
++
++/*
++ * /proc/interrupts printing:
++ */
++
++int show_interrupts(struct seq_file *p, void *v)
++{
++ int i = *(loff_t *) v, j;
++ struct irqaction * action;
++ unsigned long flags;
++
++ if (i == 0) {
++ seq_printf(p, " ");
++ for_each_online_cpu(j)
++ seq_printf(p, "CPU%-8d",j);
++ seq_putc(p, '\n');
++ }
++
++ if (i < NR_IRQS) {
++ spin_lock_irqsave(&irq_desc[i].lock, flags);
++ action = irq_desc[i].action;
++ if (!action)
++ goto skip;
++ seq_printf(p, "%3d: ",i);
++#ifndef CONFIG_SMP
++ seq_printf(p, "%10u ", kstat_irqs(i));
++#else
++ for_each_online_cpu(j)
++ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
++#endif
++ seq_printf(p, " %14s", irq_desc[i].chip->typename);
++ seq_printf(p, " %s", action->name);
++
++ for (action=action->next; action; action = action->next)
++ seq_printf(p, ", %s", action->name);
++
++ seq_putc(p, '\n');
++skip:
++ spin_unlock_irqrestore(&irq_desc[i].lock, flags);
++ } else if (i == NR_IRQS) {
++ seq_printf(p, "NMI: ");
++ for_each_online_cpu(j)
++ seq_printf(p, "%10u ", nmi_count(j));
++ seq_putc(p, '\n');
++#ifdef CONFIG_X86_LOCAL_APIC
++ seq_printf(p, "LOC: ");
++ for_each_online_cpu(j)
++ seq_printf(p, "%10u ",
++ per_cpu(irq_stat,j).apic_timer_irqs);
++ seq_putc(p, '\n');
++#endif
++ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
++#if defined(CONFIG_X86_IO_APIC)
++ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
++#endif
++ }
++ return 0;
++}
++
++#ifdef CONFIG_HOTPLUG_CPU
++
++void fixup_irqs(cpumask_t map)
++{
++ unsigned int irq;
++ static int warned;
++
++ for (irq = 0; irq < NR_IRQS; irq++) {
++ cpumask_t mask;
++ if (irq == 2)
++ continue;
++
++ cpus_and(mask, irq_desc[irq].affinity, map);
++ if (any_online_cpu(mask) == NR_CPUS) {
++ /*printk("Breaking affinity for irq %i\n", irq);*/
++ mask = map;
++ }
++ if (irq_desc[irq].chip->set_affinity)
++ irq_desc[irq].chip->set_affinity(irq, mask);
++ else if (irq_desc[irq].action && !(warned++))
++ printk("Cannot set affinity for irq %i\n", irq);
++ }
++
++#if 0
++ barrier();
++ /* Ingo Molnar says: "after the IO-APIC masks have been redirected
++ [note the nop - the interrupt-enable boundary on x86 is two
++ instructions from sti] - to flush out pending hardirqs and
++ IPIs. After this point nothing is supposed to reach this CPU." */
++ __asm__ __volatile__("sti; nop; cli");
++ barrier();
++#else
++ /* That doesn't seem sufficient. Give it 1ms. */
++ local_irq_enable();
++ mdelay(1);
++ local_irq_disable();
++#endif
++}
++#endif
++
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/ldt-xen.c 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,277 @@
++/*
++ * linux/kernel/ldt.c
++ *
++ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
++ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
++ */
++
++#include <linux/errno.h>
++#include <linux/sched.h>
++#include <linux/string.h>
++#include <linux/mm.h>
++#include <linux/smp.h>
++#include <linux/smp_lock.h>
++#include <linux/vmalloc.h>
++#include <linux/slab.h>
++
++#include <asm/uaccess.h>
++#include <asm/system.h>
++#include <asm/ldt.h>
++#include <asm/desc.h>
++#include <asm/mmu_context.h>
++
++#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
++static void flush_ldt(void *null)
++{
++ if (current->active_mm)
++ load_LDT(&current->active_mm->context);
++}
++#endif
++
++static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
++{
++ void *oldldt;
++ void *newldt;
++ int oldsize;
++
++ if (mincount <= pc->size)
++ return 0;
++ oldsize = pc->size;
++ mincount = (mincount+511)&(~511);
++ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
++ newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
++ else
++ newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
++
++ if (!newldt)
++ return -ENOMEM;
++
++ if (oldsize)
++ memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
++ oldldt = pc->ldt;
++ memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
++ pc->ldt = newldt;
++ wmb();
++ pc->size = mincount;
++ wmb();
++
++ if (reload) {
++#ifdef CONFIG_SMP
++ cpumask_t mask;
++ preempt_disable();
++#endif
++ make_pages_readonly(
++ pc->ldt,
++ (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
++ XENFEAT_writable_descriptor_tables);
++ load_LDT(pc);
++#ifdef CONFIG_SMP
++ mask = cpumask_of_cpu(smp_processor_id());
++ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
++ smp_call_function(flush_ldt, NULL, 1, 1);
++ preempt_enable();
++#endif
++ }
++ if (oldsize) {
++ make_pages_writable(
++ oldldt,
++ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
++ XENFEAT_writable_descriptor_tables);
++ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
++ vfree(oldldt);
++ else
++ kfree(oldldt);
++ }
++ return 0;
++}
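++
++/*
++ * Note: mincount is rounded up to a multiple of 512 entries above
++ * ((mincount+511)&(~511)); at 8 bytes per LDT descriptor that is
++ * page (4 kB) granularity, which keeps the make_pages_readonly()
++ * and make_pages_writable() calls operating on whole pages.
++ */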
++
++static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
++{
++ int err = alloc_ldt(new, old->size, 0);
++ if (err < 0)
++ return err;
++ memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
++ make_pages_readonly(
++ new->ldt,
++ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
++ XENFEAT_writable_descriptor_tables);
++ return 0;
++}
++
++/*
++ * we do not have to muck with descriptors here; that is
++ * done in switch_mm() as needed.
++ */
++int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
++{
++ struct mm_struct * old_mm;
++ int retval = 0;
++
++ init_MUTEX(&mm->context.sem);
++ mm->context.size = 0;
++ mm->context.has_foreign_mappings = 0;
++ old_mm = current->mm;
++ if (old_mm && old_mm->context.size > 0) {
++ down(&old_mm->context.sem);
++ retval = copy_ldt(&mm->context, &old_mm->context);
++ up(&old_mm->context.sem);
++ }
++ return retval;
++}
++
++/*
++ * No need to lock the MM as we are the last user
++ */
++void destroy_context(struct mm_struct *mm)
++{
++ if (mm->context.size) {
++ if (mm == current->active_mm)
++ clear_LDT();
++ make_pages_writable(
++ mm->context.ldt,
++ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
++ XENFEAT_writable_descriptor_tables);
++ if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
++ vfree(mm->context.ldt);
++ else
++ kfree(mm->context.ldt);
++ mm->context.size = 0;
++ }
++}
++
++static int read_ldt(void __user * ptr, unsigned long bytecount)
++{
++ int err;
++ unsigned long size;
++ struct mm_struct * mm = current->mm;
++
++ if (!mm->context.size)
++ return 0;
++ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
++ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
++
++ down(&mm->context.sem);
++ size = mm->context.size*LDT_ENTRY_SIZE;
++ if (size > bytecount)
++ size = bytecount;
++
++ err = 0;
++ if (copy_to_user(ptr, mm->context.ldt, size))
++ err = -EFAULT;
++ up(&mm->context.sem);
++ if (err < 0)
++ goto error_return;
++ if (size != bytecount) {
++ /* zero-fill the rest */
++ if (clear_user(ptr+size, bytecount-size) != 0) {
++ err = -EFAULT;
++ goto error_return;
++ }
++ }
++ return bytecount;
++error_return:
++ return err;
++}
++
++static int read_default_ldt(void __user * ptr, unsigned long bytecount)
++{
++ int err;
++ unsigned long size;
++ void *address;
++
++ err = 0;
++ address = &default_ldt[0];
++ size = 5*sizeof(struct desc_struct);
++ if (size > bytecount)
++ size = bytecount;
++
++ err = size;
++ if (copy_to_user(ptr, address, size))
++ err = -EFAULT;
++
++ return err;
++}
++
++static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
++{
++ struct mm_struct * mm = current->mm;
++ __u32 entry_1, entry_2;
++ int error;
++ struct user_desc ldt_info;
++
++ error = -EINVAL;
++ if (bytecount != sizeof(ldt_info))
++ goto out;
++ error = -EFAULT;
++ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
++ goto out;
++
++ error = -EINVAL;
++ if (ldt_info.entry_number >= LDT_ENTRIES)
++ goto out;
++ if (ldt_info.contents == 3) {
++ if (oldmode)
++ goto out;
++ if (ldt_info.seg_not_present == 0)
++ goto out;
++ }
++
++ down(&mm->context.sem);
++ if (ldt_info.entry_number >= mm->context.size) {
++ error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
++ if (error < 0)
++ goto out_unlock;
++ }
++
++ /* Allow LDTs to be cleared by the user. */
++ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
++ if (oldmode || LDT_empty(&ldt_info)) {
++ entry_1 = 0;
++ entry_2 = 0;
++ goto install;
++ }
++ }
++
++ entry_1 = LDT_entry_a(&ldt_info);
++ entry_2 = LDT_entry_b(&ldt_info);
++ if (oldmode)
++ entry_2 &= ~(1 << 20);
++
++ /* Install the new entry ... */
++install:
++ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
++ entry_1, entry_2);
++
++out_unlock:
++ up(&mm->context.sem);
++out:
++ return error;
++}
++
++asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
++{
++ int ret = -ENOSYS;
++
++ switch (func) {
++ case 0:
++ ret = read_ldt(ptr, bytecount);
++ break;
++ case 1:
++ ret = write_ldt(ptr, bytecount, 1);
++ break;
++ case 2:
++ ret = read_default_ldt(ptr, bytecount);
++ break;
++ case 0x11:
++ ret = write_ldt(ptr, bytecount, 0);
++ break;
++ }
++ return ret;
++}
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/microcode-xen.c 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,151 @@
++/*
++ * Intel CPU Microcode Update Driver for Linux
++ *
++ * Copyright (C) 2000-2004 Tigran Aivazian
++ *
++ * This driver allows upgrading microcode on Intel processors
++ * belonging to IA-32 family - PentiumPro, Pentium II,
++ * Pentium III, Xeon, Pentium 4, etc.
++ *
++ * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
++ * Order Number 245472 or free download from:
++ *
++ * http://developer.intel.com/design/pentium4/manuals/245472.htm
++ *
++ * For more information, go to http://www.urbanmyth.org/microcode
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++//#define DEBUG /* pr_debug */
++#include <linux/capability.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/sched.h>
++#include <linux/cpumask.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/miscdevice.h>
++#include <linux/spinlock.h>
++#include <linux/mm.h>
++#include <linux/mutex.h>
++#include <linux/syscalls.h>
++
++#include <asm/msr.h>
++#include <asm/uaccess.h>
++#include <asm/processor.h>
++
++MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
++MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
++MODULE_LICENSE("GPL");
++
++static int verbose;
++module_param(verbose, int, 0644);
++
++#define MICROCODE_VERSION "1.14a-xen"
++
++#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
++#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
++#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
++
++/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
++static DEFINE_MUTEX(microcode_mutex);
++
++static int microcode_open (struct inode *unused1, struct file *unused2)
++{
++ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
++}
++
++
++static int do_microcode_update (const void __user *ubuf, size_t len)
++{
++ int err;
++ void *kbuf;
++
++ kbuf = vmalloc(len);
++ if (!kbuf)
++ return -ENOMEM;
++
++ if (copy_from_user(kbuf, ubuf, len) == 0) {
++ struct xen_platform_op op;
++
++ op.cmd = XENPF_microcode_update;
++ set_xen_guest_handle(op.u.microcode.data, kbuf);
++ op.u.microcode.length = len;
++ err = HYPERVISOR_platform_op(&op);
++ } else
++ err = -EFAULT;
++
++ vfree(kbuf);
++
++ return err;
++}
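++
++/*
++ * Note: unlike the native driver, no MSR writes happen here; the
++ * raw user buffer is handed to the hypervisor via the
++ * XENPF_microcode_update platform op, and the hypervisor applies
++ * the update to the CPUs itself.
++ */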
++
++static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
++{
++ ssize_t ret;
++
++ if (len < MC_HEADER_SIZE) {
++ printk(KERN_ERR "microcode: not enough data\n");
++ return -EINVAL;
++ }
++
++ mutex_lock(&microcode_mutex);
++
++ ret = do_microcode_update(buf, len);
++ if (!ret)
++ ret = (ssize_t)len;
++
++ mutex_unlock(&microcode_mutex);
++
++ return ret;
++}
++
++static struct file_operations microcode_fops = {
++ .owner = THIS_MODULE,
++ .write = microcode_write,
++ .open = microcode_open,
++};
++
++static struct miscdevice microcode_dev = {
++ .minor = MICROCODE_MINOR,
++ .name = "microcode",
++ .fops = &microcode_fops,
++};
++
++static int __init microcode_init (void)
++{
++ int error;
++
++ error = misc_register(&microcode_dev);
++ if (error) {
++ printk(KERN_ERR
++ "microcode: can't misc_register on minor=%d\n",
++ MICROCODE_MINOR);
++ return error;
++ }
++
++ printk(KERN_INFO
++ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
++ return 0;
++}
++
++static void __exit microcode_exit (void)
++{
++ misc_deregister(&microcode_dev);
++}
++
++module_init(microcode_init)
++module_exit(microcode_exit)
++MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/mpparse-xen.c 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,1191 @@
++/*
++ * Intel Multiprocessor Specification 1.1 and 1.4
++ * compliant MP-table parsing routines.
++ *
++ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
++ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
++ *
++ * Fixes
++ * Erich Boleyn : MP v1.4 and additional changes.
++ * Alan Cox : Added EBDA scanning
++ * Ingo Molnar : various cleanups and rewrites
++ * Maciej W. Rozycki: Bits for default MP configurations
++ * Paul Diefenbaugh: Added full ACPI support
++ */
++
++#include <linux/mm.h>
++#include <linux/init.h>
++#include <linux/acpi.h>
++#include <linux/delay.h>
++#include <linux/bootmem.h>
++#include <linux/smp_lock.h>
++#include <linux/kernel_stat.h>
++#include <linux/mc146818rtc.h>
++#include <linux/bitops.h>
++
++#include <asm/smp.h>
++#include <asm/acpi.h>
++#include <asm/mtrr.h>
++#include <asm/mpspec.h>
++#include <asm/io_apic.h>
++
++#include <mach_apic.h>
++#include <mach_mpparse.h>
++#include <bios_ebda.h>
++
++/* Have we found an MP table */
++int smp_found_config;
++unsigned int __initdata maxcpus = NR_CPUS;
++
++/*
++ * Various Linux-internal data structures created from the
++ * MP-table.
++ */
++int apic_version [MAX_APICS];
++int mp_bus_id_to_type [MAX_MP_BUSSES];
++int mp_bus_id_to_node [MAX_MP_BUSSES];
++int mp_bus_id_to_local [MAX_MP_BUSSES];
++int quad_local_to_mp_bus_id [NR_CPUS/4][4];
++int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
++static int mp_current_pci_id;
++
++/* I/O APIC entries */
++struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
++
++/* # of MP IRQ source entries */
++struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
++
++/* MP IRQ source entries */
++int mp_irq_entries;
++
++int nr_ioapics;
++
++int pic_mode;
++unsigned long mp_lapic_addr;
++
++unsigned int def_to_bigsmp = 0;
++
++/* Processor that is doing the boot up */
++unsigned int boot_cpu_physical_apicid = -1U;
++/* Internal processor count */
++static unsigned int __devinitdata num_processors;
++
++/* Bitmask of physically existing CPUs */
++physid_mask_t phys_cpu_present_map;
++
++u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
++
++/*
++ * Intel MP BIOS table parsing routines:
++ */
++
++
++/*
++ * Checksum an MP configuration block.
++ */
++
++static int __init mpf_checksum(unsigned char *mp, int len)
++{
++ int sum = 0;
++
++ while (len--)
++ sum += *mp++;
++
++ return sum & 0xFF;
++}
++
++/*
++ * Have to match translation table entries to main table entries by counter
++ * hence the mpc_record variable .... can't see a less disgusting way of
++ * doing this ....
++ */
++
++static int mpc_record;
++static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
++
++#ifndef CONFIG_XEN
++static void __devinit MP_processor_info (struct mpc_config_processor *m)
++{
++ int ver, apicid;
++ physid_mask_t phys_cpu;
++
++ if (!(m->mpc_cpuflag & CPU_ENABLED))
++ return;
++
++ apicid = mpc_apic_id(m, translation_table[mpc_record]);
++
++ if (m->mpc_featureflag&(1<<0))
++ Dprintk(" Floating point unit present.\n");
++ if (m->mpc_featureflag&(1<<7))
++ Dprintk(" Machine Exception supported.\n");
++ if (m->mpc_featureflag&(1<<8))
++ Dprintk(" 64 bit compare & exchange supported.\n");
++ if (m->mpc_featureflag&(1<<9))
++ Dprintk(" Internal APIC present.\n");
++ if (m->mpc_featureflag&(1<<11))
++ Dprintk(" SEP present.\n");
++ if (m->mpc_featureflag&(1<<12))
++ Dprintk(" MTRR present.\n");
++ if (m->mpc_featureflag&(1<<13))
++ Dprintk(" PGE present.\n");
++ if (m->mpc_featureflag&(1<<14))
++ Dprintk(" MCA present.\n");
++ if (m->mpc_featureflag&(1<<15))
++ Dprintk(" CMOV present.\n");
++ if (m->mpc_featureflag&(1<<16))
++ Dprintk(" PAT present.\n");
++ if (m->mpc_featureflag&(1<<17))
++ Dprintk(" PSE present.\n");
++ if (m->mpc_featureflag&(1<<18))
++ Dprintk(" PSN present.\n");
++ if (m->mpc_featureflag&(1<<19))
++ Dprintk(" Cache Line Flush Instruction present.\n");
++ /* 20 Reserved */
++ if (m->mpc_featureflag&(1<<21))
++ Dprintk(" Debug Trace and EMON Store present.\n");
++ if (m->mpc_featureflag&(1<<22))
++ Dprintk(" ACPI Thermal Throttle Registers present.\n");
++ if (m->mpc_featureflag&(1<<23))
++ Dprintk(" MMX present.\n");
++ if (m->mpc_featureflag&(1<<24))
++ Dprintk(" FXSR present.\n");
++ if (m->mpc_featureflag&(1<<25))
++ Dprintk(" XMM present.\n");
++ if (m->mpc_featureflag&(1<<26))
++ Dprintk(" Willamette New Instructions present.\n");
++ if (m->mpc_featureflag&(1<<27))
++ Dprintk(" Self Snoop present.\n");
++ if (m->mpc_featureflag&(1<<28))
++ Dprintk(" HT present.\n");
++ if (m->mpc_featureflag&(1<<29))
++ Dprintk(" Thermal Monitor present.\n");
++ /* 30, 31 Reserved */
++
++
++ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
++ Dprintk(" Bootup CPU\n");
++ boot_cpu_physical_apicid = m->mpc_apicid;
++ }
++
++ ver = m->mpc_apicver;
++
++ /*
++ * Validate version
++ */
++ if (ver == 0x0) {
++ printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
++ "fixing up to 0x10. (tell your hw vendor)\n",
++ m->mpc_apicid);
++ ver = 0x10;
++ }
++ apic_version[m->mpc_apicid] = ver;
++
++ phys_cpu = apicid_to_cpu_present(apicid);
++ physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
++
++ if (num_processors >= NR_CPUS) {
++ printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
++ " Processor ignored.\n", NR_CPUS);
++ return;
++ }
++
++ if (num_processors >= maxcpus) {
++ printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
++ " Processor ignored.\n", maxcpus);
++ return;
++ }
++
++ cpu_set(num_processors, cpu_possible_map);
++ num_processors++;
++
++ /*
++ * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
++ * but we need to work out other dependencies like SMP_SUSPEND etc
++ * before this can be done without some confusion.
++ * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
++ * - Ashok Raj <ashok.raj@intel.com>
++ */
++ if (num_processors > 8) {
++ switch (boot_cpu_data.x86_vendor) {
++ case X86_VENDOR_INTEL:
++ if (!APIC_XAPIC(ver)) {
++ def_to_bigsmp = 0;
++ break;
++ }
++ /* If P4 and above fall through */
++ case X86_VENDOR_AMD:
++ def_to_bigsmp = 1;
++ }
++ }
++ bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
++}
++#else
++void __init MP_processor_info (struct mpc_config_processor *m)
++{
++ num_processors++;
++}
++#endif /* CONFIG_XEN */
++
++static void __init MP_bus_info (struct mpc_config_bus *m)
++{
++ char str[7];
++
++ memcpy(str, m->mpc_bustype, 6);
++ str[6] = 0;
++
++ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
++
++ if (m->mpc_busid >= MAX_MP_BUSSES) {
++ printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
++ "is too large, max. supported is %d\n",
++ m->mpc_busid, str, MAX_MP_BUSSES - 1);
++ return;
++ }
++
++ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
++ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
++ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
++ mpc_oem_pci_bus(m, translation_table[mpc_record]);
++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
++ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
++ mp_current_pci_id++;
++ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
++ } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
++ } else {
++ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
++ }
++}
++
++static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
++{
++ if (!(m->mpc_flags & MPC_APIC_USABLE))
++ return;
++
++ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
++ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
++ if (nr_ioapics >= MAX_IO_APICS) {
++ printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
++ MAX_IO_APICS, nr_ioapics);
++ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
++ }
++ if (!m->mpc_apicaddr) {
++ printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
++ " found in MP table, skipping!\n");
++ return;
++ }
++ mp_ioapics[nr_ioapics] = *m;
++ nr_ioapics++;
++}
++
++static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
++{
++ mp_irqs [mp_irq_entries] = *m;
++ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
++ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
++ m->mpc_irqtype, m->mpc_irqflag & 3,
++ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
++ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
++ if (++mp_irq_entries == MAX_IRQ_SOURCES)
++ panic("Max # of irq sources exceeded!!\n");
++}
++
++static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
++{
++ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
++ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
++ m->mpc_irqtype, m->mpc_irqflag & 3,
++ (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
++ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
++ /*
++ * Well it seems all SMP boards in existence
++ * use ExtINT/LVT1 == LINT0 and
++ * NMI/LVT2 == LINT1 - the following check
++ * will show us if this assumption is false.
++ * Until then we do not have to add baggage.
++ */
++ if ((m->mpc_irqtype == mp_ExtINT) &&
++ (m->mpc_destapiclint != 0))
++ BUG();
++ if ((m->mpc_irqtype == mp_NMI) &&
++ (m->mpc_destapiclint != 1))
++ BUG();
++}
++
++#ifdef CONFIG_X86_NUMAQ
++static void __init MP_translation_info (struct mpc_config_translation *m)
++{
++ printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
++
++ if (mpc_record >= MAX_MPC_ENTRY)
++ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
++ else
++ translation_table[mpc_record] = m; /* stash this for later */
++ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
++ node_set_online(m->trans_quad);
++}
++
++/*
++ * Read/parse the MPC oem tables
++ */
++
++static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
++ unsigned short oemsize)
++{
++ int count = sizeof (*oemtable); /* the header size */
++ unsigned char *oemptr = ((unsigned char *)oemtable)+count;
++
++ mpc_record = 0;
++ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
++ if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
++ {
++ printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
++ oemtable->oem_signature[0],
++ oemtable->oem_signature[1],
++ oemtable->oem_signature[2],
++ oemtable->oem_signature[3]);
++ return;
++ }
++ if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
++ {
++ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
++ return;
++ }
++ while (count < oemtable->oem_length) {
++ switch (*oemptr) {
++ case MP_TRANSLATION:
++ {
++ struct mpc_config_translation *m=
++ (struct mpc_config_translation *)oemptr;
++ MP_translation_info(m);
++ oemptr += sizeof(*m);
++ count += sizeof(*m);
++ ++mpc_record;
++ break;
++ }
++ default:
++ {
++ printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
++ return;
++ }
++ }
++ }
++}
++
++static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
++ char *productid)
++{
++ if (strncmp(oem, "IBM NUMA", 8))
++ printk("Warning! May not be a NUMA-Q system!\n");
++ if (mpc->mpc_oemptr)
++ smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
++ mpc->mpc_oemsize);
++}
++#endif /* CONFIG_X86_NUMAQ */
++
++/*
++ * Read/parse the MPC
++ */
++
++static int __init smp_read_mpc(struct mp_config_table *mpc)
++{
++ char str[16];
++ char oem[10];
++ int count=sizeof(*mpc);
++ unsigned char *mpt=((unsigned char *)mpc)+count;
++
++ if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
++ printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
++ *(u32 *)mpc->mpc_signature);
++ return 0;
++ }
++ if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
++ printk(KERN_ERR "SMP mptable: checksum error!\n");
++ return 0;
++ }
++ if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
++ printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
++ mpc->mpc_spec);
++ return 0;
++ }
++ if (!mpc->mpc_lapic) {
++ printk(KERN_ERR "SMP mptable: null local APIC address!\n");
++ return 0;
++ }
++ memcpy(oem,mpc->mpc_oem,8);
++ oem[8]=0;
++ printk(KERN_INFO "OEM ID: %s ",oem);
++
++ memcpy(str,mpc->mpc_productid,12);
++ str[12]=0;
++ printk("Product ID: %s ",str);
++
++ mps_oem_check(mpc, oem, str);
++
++ printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
++
++ /*
++ * Save the local APIC address (it might be non-default) -- but only
++ * if we're not using ACPI.
++ */
++ if (!acpi_lapic)
++ mp_lapic_addr = mpc->mpc_lapic;
++
++ /*
++ * Now process the configuration blocks.
++ */
++ mpc_record = 0;
++ while (count < mpc->mpc_length) {
++ switch(*mpt) {
++ case MP_PROCESSOR:
++ {
++ struct mpc_config_processor *m=
++ (struct mpc_config_processor *)mpt;
++ /* ACPI may have already provided this data */
++ if (!acpi_lapic)
++ MP_processor_info(m);
++ mpt += sizeof(*m);
++ count += sizeof(*m);
++ break;
++ }
++ case MP_BUS:
++ {
++ struct mpc_config_bus *m=
++ (struct mpc_config_bus *)mpt;
++ MP_bus_info(m);
++ mpt += sizeof(*m);
++ count += sizeof(*m);
++ break;
++ }
++ case MP_IOAPIC:
++ {
++ struct mpc_config_ioapic *m=
++ (struct mpc_config_ioapic *)mpt;
++ MP_ioapic_info(m);
++ mpt+=sizeof(*m);
++ count+=sizeof(*m);
++ break;
++ }
++ case MP_INTSRC:
++ {
++ struct mpc_config_intsrc *m=
++ (struct mpc_config_intsrc *)mpt;
++
++ MP_intsrc_info(m);
++ mpt+=sizeof(*m);
++ count+=sizeof(*m);
++ break;
++ }
++ case MP_LINTSRC:
++ {
++ struct mpc_config_lintsrc *m=
++ (struct mpc_config_lintsrc *)mpt;
++ MP_lintsrc_info(m);
++ mpt+=sizeof(*m);
++ count+=sizeof(*m);
++ break;
++ }
++ default:
++ {
++ count = mpc->mpc_length;
++ break;
++ }
++ }
++ ++mpc_record;
++ }
++ clustered_apic_check();
++ if (!num_processors)
++ printk(KERN_ERR "SMP mptable: no processors registered!\n");
++ return num_processors;
++}
++
++static int __init ELCR_trigger(unsigned int irq)
++{
++ unsigned int port;
++
++ port = 0x4d0 + (irq >> 3);
++ return (inb(port) >> (irq & 7)) & 1;
++}
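++
++/*
++ * The ELCR (Edge/Level Control Register) lives at I/O ports 0x4d0 and
++ * 0x4d1; bit i of the 16-bit pair is set when ISA IRQ i is configured
++ * as level-triggered, which is what the helper above reads back.
++ */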
++
++static void __init construct_default_ioirq_mptable(int mpc_default_type)
++{
++ struct mpc_config_intsrc intsrc;
++ int i;
++ int ELCR_fallback = 0;
++
++ intsrc.mpc_type = MP_INTSRC;
++ intsrc.mpc_irqflag = 0; /* conforming */
++ intsrc.mpc_srcbus = 0;
++ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
++
++ intsrc.mpc_irqtype = mp_INT;
++
++ /*
++ * If true, we have an ISA/PCI system with no IRQ entries
++ * in the MP table. To prevent the PCI interrupts from being set up
++ * incorrectly, we try to use the ELCR. The sanity check to see if
++ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
++ * never be level sensitive, so we simply see if the ELCR agrees.
++ * If it does, we assume it's valid.
++ */
++ if (mpc_default_type == 5) {
++ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
++
++ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
++ printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
++ else {
++ printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
++ ELCR_fallback = 1;
++ }
++ }
++
++ for (i = 0; i < 16; i++) {
++ switch (mpc_default_type) {
++ case 2:
++ if (i == 0 || i == 13)
++ continue; /* IRQ0 & IRQ13 not connected */
++ /* fall through */
++ default:
++ if (i == 2)
++ continue; /* IRQ2 is never connected */
++ }
++
++ if (ELCR_fallback) {
++ /*
++ * If the ELCR indicates a level-sensitive interrupt, we
++ * copy that information over to the MP table in the
++ * irqflag field (level sensitive, active high polarity).
++ */
++ if (ELCR_trigger(i))
++ intsrc.mpc_irqflag = 13;
++ else
++ intsrc.mpc_irqflag = 0;
++ }
++
++ intsrc.mpc_srcbusirq = i;
++ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
++ MP_intsrc_info(&intsrc);
++ }
++
++ intsrc.mpc_irqtype = mp_ExtINT;
++ intsrc.mpc_srcbusirq = 0;
++ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
++ MP_intsrc_info(&intsrc);
++}
++
++static inline void __init construct_default_ISA_mptable(int mpc_default_type)
++{
++ struct mpc_config_processor processor;
++ struct mpc_config_bus bus;
++ struct mpc_config_ioapic ioapic;
++ struct mpc_config_lintsrc lintsrc;
++ int linttypes[2] = { mp_ExtINT, mp_NMI };
++ int i;
++
++ /*
++ * local APIC has default address
++ */
++ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
++
++ /*
++ * 2 CPUs, numbered 0 & 1.
++ */
++ processor.mpc_type = MP_PROCESSOR;
++ /* Either an integrated APIC or a discrete 82489DX. */
++ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
++ processor.mpc_cpuflag = CPU_ENABLED;
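++ /* Pack the CPUID signature: family in bits 11-8, model in bits 7-4,
++ stepping in bits 3-0. */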
++ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
++ (boot_cpu_data.x86_model << 4) |
++ boot_cpu_data.x86_mask;
++ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
++ processor.mpc_reserved[0] = 0;
++ processor.mpc_reserved[1] = 0;
++ for (i = 0; i < 2; i++) {
++ processor.mpc_apicid = i;
++ MP_processor_info(&processor);
++ }
++
++ bus.mpc_type = MP_BUS;
++ bus.mpc_busid = 0;
++ switch (mpc_default_type) {
++ default:
++ printk("???\n");
++ printk(KERN_ERR "Unknown standard configuration %d\n",
++ mpc_default_type);
++ /* fall through */
++ case 1:
++ case 5:
++ memcpy(bus.mpc_bustype, "ISA ", 6);
++ break;
++ case 2:
++ case 6:
++ case 3:
++ memcpy(bus.mpc_bustype, "EISA ", 6);
++ break;
++ case 4:
++ case 7:
++ memcpy(bus.mpc_bustype, "MCA ", 6);
++ }
++ MP_bus_info(&bus);
++ if (mpc_default_type > 4) {
++ bus.mpc_busid = 1;
++ memcpy(bus.mpc_bustype, "PCI ", 6);
++ MP_bus_info(&bus);
++ }
++
++ ioapic.mpc_type = MP_IOAPIC;
++ ioapic.mpc_apicid = 2;
++ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
++ ioapic.mpc_flags = MPC_APIC_USABLE;
++ ioapic.mpc_apicaddr = 0xFEC00000;
++ MP_ioapic_info(&ioapic);
++
++ /*
++ * We set up most of the low 16 IO-APIC pins according to MPS rules.
++ */
++ construct_default_ioirq_mptable(mpc_default_type);
++
++ lintsrc.mpc_type = MP_LINTSRC;
++ lintsrc.mpc_irqflag = 0; /* conforming */
++ lintsrc.mpc_srcbusid = 0;
++ lintsrc.mpc_srcbusirq = 0;
++ lintsrc.mpc_destapic = MP_APIC_ALL;
++ for (i = 0; i < 2; i++) {
++ lintsrc.mpc_irqtype = linttypes[i];
++ lintsrc.mpc_destapiclint = i;
++ MP_lintsrc_info(&lintsrc);
++ }
++}
++
++static struct intel_mp_floating *mpf_found;
++
++/*
++ * Scan the memory blocks for an SMP configuration block.
++ */
++void __init get_smp_config (void)
++{
++ struct intel_mp_floating *mpf = mpf_found;
++
++ /*
++ * ACPI supports both logical (e.g. Hyper-Threading) and physical
++ * processors, whereas MPS only supports physical.
++ */
++ if (acpi_lapic && acpi_ioapic) {
++ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
++ return;
++ }
++ else if (acpi_lapic)
++ printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
++
++ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
++ if (mpf->mpf_feature2 & (1<<7)) {
++ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
++ pic_mode = 1;
++ } else {
++ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
++ pic_mode = 0;
++ }
++
++ /*
++ * Now see if we need to read further.
++ */
++ if (mpf->mpf_feature1 != 0) {
++
++ printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
++ construct_default_ISA_mptable(mpf->mpf_feature1);
++
++ } else if (mpf->mpf_physptr) {
++
++ /*
++ * Read the physical hardware table. Anything here will
++ * override the defaults.
++ */
++ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
++ smp_found_config = 0;
++ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
++ printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
++ return;
++ }
++ /*
++ * If there are no explicit MP IRQ entries, then we are
++ * broken. We set up most of the low 16 IO-APIC pins to
++ * ISA defaults and hope it will work.
++ */
++ if (!mp_irq_entries) {
++ struct mpc_config_bus bus;
++
++ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
++
++ bus.mpc_type = MP_BUS;
++ bus.mpc_busid = 0;
++ memcpy(bus.mpc_bustype, "ISA ", 6);
++ MP_bus_info(&bus);
++
++ construct_default_ioirq_mptable(0);
++ }
++
++ } else
++ BUG();
++
++ printk(KERN_INFO "Processors: %d\n", num_processors);
++ /*
++ * Only use the first configuration found.
++ */
++}
++
++static int __init smp_scan_config (unsigned long base, unsigned long length)
++{
++ unsigned long *bp = isa_bus_to_virt(base);
++ struct intel_mp_floating *mpf;
++
++ Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
++ if (sizeof(*mpf) != 16)
++ printk("Error: MPF size\n");
++
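++ /*
++ * The MP floating pointer is 16-byte aligned, so step through the
++ * region 16 bytes at a time (bp is an unsigned long pointer, hence
++ * bp += 4) looking for the SMP_MAGIC_IDENT ("_MP_") signature.
++ */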
++ while (length > 0) {
++ mpf = (struct intel_mp_floating *)bp;
++ if ((*bp == SMP_MAGIC_IDENT) &&
++ (mpf->mpf_length == 1) &&
++ !mpf_checksum((unsigned char *)bp, 16) &&
++ ((mpf->mpf_specification == 1)
++ || (mpf->mpf_specification == 4)) ) {
++
++ smp_found_config = 1;
++#ifndef CONFIG_XEN
++ printk(KERN_INFO "found SMP MP-table at %08lx\n",
++ virt_to_phys(mpf));
++ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
++ if (mpf->mpf_physptr) {
++ /*
++ * We cannot access the MPC table to compute its
++ * size yet, as only a few megabytes from the
++ * bottom of memory are mapped at this point.
++ * The PC-9800's MPC table sits at the very end
++ * of physical memory, so blindly reserving
++ * PAGE_SIZE from mpf->mpf_physptr would trigger
++ * BUG() in reserve_bootmem.
++ */
++ unsigned long size = PAGE_SIZE;
++ unsigned long end = max_low_pfn * PAGE_SIZE;
++ if (mpf->mpf_physptr + size > end)
++ size = end - mpf->mpf_physptr;
++ reserve_bootmem(mpf->mpf_physptr, size);
++ }
++#else
++ printk(KERN_INFO "found SMP MP-table at %08lx\n",
++ ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
++#endif
++
++ mpf_found = mpf;
++ return 1;
++ }
++ bp += 4;
++ length -= 16;
++ }
++ return 0;
++}
++
++void __init find_smp_config (void)
++{
++#ifndef CONFIG_XEN
++ unsigned int address;
++#endif
++
++ /*
++ * FIXME: Linux assumes you have 640K of base ram..
++ * this continues the error...
++ *
++ * 1) Scan the bottom 1K for a signature
++ * 2) Scan the top 1K of base RAM
++ * 3) Scan the 64K of bios
++ */
++ if (smp_scan_config(0x0,0x400) ||
++ smp_scan_config(639*0x400,0x400) ||
++ smp_scan_config(0xF0000,0x10000))
++ return;
++ /*
++ * If it is an SMP machine we should know now, unless the
++ * configuration is in an EISA/MCA bus machine with an
++ * extended bios data area.
++ *
++ * There is a real-mode segmented pointer pointing to the
++ * 4K EBDA area at 0x40E; calculate and scan it here.
++ *
++ * NOTE! There are Linux loaders that will corrupt the EBDA
++ * area, and as such this kind of SMP config may be less
++ * trustworthy, simply because the SMP table may have been
++ * stomped on during early boot. These loaders are buggy and
++ * should be fixed.
++ *
++ * The MP 1.4 spec says to scan only the first 1K of the 4K EBDA.
++ */
++
++#ifndef CONFIG_XEN
++ address = get_bios_ebda();
++ if (address)
++ smp_scan_config(address, 0x400);
++#endif
++}
++
++int es7000_plat;
++
++/* --------------------------------------------------------------------------
++ ACPI-based MP Configuration
++ -------------------------------------------------------------------------- */
++
++#ifdef CONFIG_ACPI
++
++void __init mp_register_lapic_address (
++ u64 address)
++{
++#ifndef CONFIG_XEN
++ mp_lapic_addr = (unsigned long) address;
++
++ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
++
++ if (boot_cpu_physical_apicid == -1U)
++ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
++
++ Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
++#endif
++}
++
++
++void __devinit mp_register_lapic (
++ u8 id,
++ u8 enabled)
++{
++ struct mpc_config_processor processor;
++ int boot_cpu = 0;
++
++ if (MAX_APICS - id <= 0) {
++ printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
++ id, MAX_APICS);
++ return;
++ }
++
++ if (id == boot_cpu_physical_apicid)
++ boot_cpu = 1;
++
++#ifndef CONFIG_XEN
++ processor.mpc_type = MP_PROCESSOR;
++ processor.mpc_apicid = id;
++ processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
++ processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
++ processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
++ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
++ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
++ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
++ processor.mpc_reserved[0] = 0;
++ processor.mpc_reserved[1] = 0;
++#endif
++
++ MP_processor_info(&processor);
++}
++
++#ifdef CONFIG_X86_IO_APIC
++
++#define MP_ISA_BUS 0
++#define MP_MAX_IOAPIC_PIN 127
++
++static struct mp_ioapic_routing {
++ int apic_id;
++ int gsi_base;
++ int gsi_end;
++ u32 pin_programmed[4];
++} mp_ioapic_routing[MAX_IO_APICS];
++
++
++static int mp_find_ioapic (
++ int gsi)
++{
++ int i = 0;
++
++ /* Find the IOAPIC that manages this GSI. */
++ for (i = 0; i < nr_ioapics; i++) {
++ if ((gsi >= mp_ioapic_routing[i].gsi_base)
++ && (gsi <= mp_ioapic_routing[i].gsi_end))
++ return i;
++ }
++
++ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
++
++ return -1;
++}
++
++
++void __init mp_register_ioapic (
++ u8 id,
++ u32 address,
++ u32 gsi_base)
++{
++ int idx = 0;
++ int tmpid;
++
++ if (nr_ioapics >= MAX_IO_APICS) {
++ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
++ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
++ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
++ }
++ if (!address) {
++ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
++ " found in MADT table, skipping!\n");
++ return;
++ }
++
++ idx = nr_ioapics++;
++
++ mp_ioapics[idx].mpc_type = MP_IOAPIC;
++ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
++ mp_ioapics[idx].mpc_apicaddr = address;
++
++#ifndef CONFIG_XEN
++ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
++#endif
++ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
++ && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
++ tmpid = io_apic_get_unique_id(idx, id);
++ else
++ tmpid = id;
++ if (tmpid == -1) {
++ nr_ioapics--;
++ return;
++ }
++ mp_ioapics[idx].mpc_apicid = tmpid;
++ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
++
++ /*
++ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
++ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
++ */
++ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
++ mp_ioapic_routing[idx].gsi_base = gsi_base;
++ mp_ioapic_routing[idx].gsi_end = gsi_base +
++ io_apic_get_redir_entries(idx);
++
++ printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
++ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
++ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
++ mp_ioapic_routing[idx].gsi_base,
++ mp_ioapic_routing[idx].gsi_end);
++
++ return;
++}
++
++
++void __init mp_override_legacy_irq (
++ u8 bus_irq,
++ u8 polarity,
++ u8 trigger,
++ u32 gsi)
++{
++ struct mpc_config_intsrc intsrc;
++ int ioapic = -1;
++ int pin = -1;
++
++ /*
++ * Convert 'gsi' to 'ioapic.pin'.
++ */
++ ioapic = mp_find_ioapic(gsi);
++ if (ioapic < 0)
++ return;
++ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
++
++ /*
++ * TBD: This check is for faulty timer entries, where the override
++ * erroneously sets the trigger to level, resulting in a HUGE
++ * increase of timer interrupts!
++ */
++ if ((bus_irq == 0) && (trigger == 3))
++ trigger = 1;
++
++ intsrc.mpc_type = MP_INTSRC;
++ intsrc.mpc_irqtype = mp_INT;
++ intsrc.mpc_irqflag = (trigger << 2) | polarity;
++ intsrc.mpc_srcbus = MP_ISA_BUS;
++ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
++ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
++ intsrc.mpc_dstirq = pin; /* INTIN# */
++
++ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
++ intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
++ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
++ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
++
++ mp_irqs[mp_irq_entries] = intsrc;
++ if (++mp_irq_entries == MAX_IRQ_SOURCES)
++ panic("Max # of irq sources exceeded!\n");
++
++ return;
++}
++
++void __init mp_config_acpi_legacy_irqs (void)
++{
++ struct mpc_config_intsrc intsrc;
++ int i = 0;
++ int ioapic = -1;
++
++ /*
++ * Fabricate the legacy ISA bus (bus #0, MP_ISA_BUS).
++ */
++ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
++ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
++
++ /*
++ * Older generations of ES7000 have no legacy identity mappings
++ */
++ if (es7000_plat == 1)
++ return;
++
++ /*
++ * Locate the IOAPIC that manages the ISA IRQs (0-15).
++ */
++ ioapic = mp_find_ioapic(0);
++ if (ioapic < 0)
++ return;
++
++ intsrc.mpc_type = MP_INTSRC;
++ intsrc.mpc_irqflag = 0; /* Conforming */
++ intsrc.mpc_srcbus = MP_ISA_BUS;
++ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
++
++ /*
++ * Use the default configuration for IRQs 0-15, unless
++ * overridden by (MADT) interrupt source override entries.
++ */
++ for (i = 0; i < 16; i++) {
++ int idx;
++
++ for (idx = 0; idx < mp_irq_entries; idx++) {
++ struct mpc_config_intsrc *irq = mp_irqs + idx;
++
++ /* Do we already have a mapping for this ISA IRQ? */
++ if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
++ break;
++
++ /* Do we already have a mapping for this IOAPIC pin */
++ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
++ (irq->mpc_dstirq == i))
++ break;
++ }
++
++ if (idx != mp_irq_entries) {
++ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
++ continue; /* IRQ already used */
++ }
++
++ intsrc.mpc_irqtype = mp_INT;
++ intsrc.mpc_srcbusirq = i; /* Identity mapped */
++ intsrc.mpc_dstirq = i;
++
++ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
++ "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
++ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
++ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
++ intsrc.mpc_dstirq);
++
++ mp_irqs[mp_irq_entries] = intsrc;
++ if (++mp_irq_entries == MAX_IRQ_SOURCES)
++ panic("Max # of irq sources exceeded!\n");
++ }
++}
++
++#define MAX_GSI_NUM 4096
++
++int mp_register_gsi (u32 gsi, int triggering, int polarity)
++{
++ int ioapic = -1;
++ int ioapic_pin = 0;
++ int idx, bit = 0;
++ static int pci_irq = 16;
++ /*
++ * Mapping between Global System Interrupts, which
++ * represent all possible interrupts, and IRQs
++ * assigned to actual devices.
++ */
++ static int gsi_to_irq[MAX_GSI_NUM];
++
++ /* Don't set up the ACPI SCI because it's already set up */
++ if (acpi_fadt.sci_int == gsi)
++ return gsi;
++
++ ioapic = mp_find_ioapic(gsi);
++ if (ioapic < 0) {
++ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
++ return gsi;
++ }
++
++ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
++
++ if (ioapic_renumber_irq)
++ gsi = ioapic_renumber_irq(ioapic, gsi);
++
++ /*
++ * Avoid pin reprogramming. PRTs typically include entries
++ * with redundant pin->gsi mappings (but unique PCI devices);
++ * we only program the IOAPIC on the first.
++ */
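++ /*
++ * pin_programmed provides 4 x 32 = 128 bits, one per pin up to
++ * MP_MAX_IOAPIC_PIN (127): idx selects the word, bit the position
++ * within it.
++ */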
++ bit = ioapic_pin % 32;
++ idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
++ if (idx > 3) {
++ printk(KERN_ERR "Invalid reference to IOAPIC pin "
++ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
++ ioapic_pin);
++ return gsi;
++ }
++ if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
++ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
++ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
++ return gsi_to_irq[gsi];
++ }
++
++ mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
++
++ if (triggering == ACPI_LEVEL_SENSITIVE) {
++ /*
++ * For PCI devices assign IRQs in order, avoiding gaps
++ * due to unused I/O APIC pins.
++ */
++ int irq = gsi;
++ if (gsi < MAX_GSI_NUM) {
++ /*
++ * Retain the VIA chipset work-around (gsi > 15), but
++ * avoid a problem where the 8254 timer (IRQ0) is setup
++ * via an override (so it's not on pin 0 of the ioapic),
++ * and at the same time, the pin 0 interrupt is a PCI
++ * type. The gsi > 15 test could cause these two pins
++ * to be shared as IRQ0, and they are not shareable.
++ * So test for this condition, and if necessary, avoid
++ * the pin collision.
++ */
++ if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
++ gsi = pci_irq++;
++ /*
++ * Don't assign IRQ used by ACPI SCI
++ */
++ if (gsi == acpi_fadt.sci_int)
++ gsi = pci_irq++;
++ gsi_to_irq[irq] = gsi;
++ } else {
++ printk(KERN_ERR "GSI %u is too high\n", gsi);
++ return gsi;
++ }
++ }
++
++ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
++ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
++ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
++ return gsi;
++}
++
++#endif /* CONFIG_X86_IO_APIC */
++#endif /* CONFIG_ACPI */
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/pci-dma-xen.c 2007-08-27 14:02:10.000000000 -0400
+@@ -0,0 +1,366 @@
++/*
++ * Dynamic DMA mapping support.
++ *
++ * On i386 there is no hardware dynamic DMA address translation,
++ * so consistent alloc/free are merely page allocation/freeing.
++ * The rest of the dynamic DMA mapping interface is implemented
++ * in asm/pci.h.
++ */
++
++#include <linux/types.h>
++#include <linux/mm.h>
++#include <linux/string.h>
++#include <linux/pci.h>
++#include <linux/module.h>
++#include <linux/version.h>
++#include <asm/io.h>
++#include <xen/balloon.h>
++#include <asm/swiotlb.h>
++#include <asm/tlbflush.h>
++#include <asm-i386/mach-xen/asm/swiotlb.h>
++#include <asm/bug.h>
++
++#ifdef __x86_64__
++#include <asm/proto.h>
++
++int iommu_merge __read_mostly = 0;
++EXPORT_SYMBOL(iommu_merge);
++
++dma_addr_t bad_dma_address __read_mostly;
++EXPORT_SYMBOL(bad_dma_address);
++
++/* This tells the BIO block layer to assume merging. Default to off
++ because we cannot guarantee merging later. */
++int iommu_bio_merge __read_mostly = 0;
++EXPORT_SYMBOL(iommu_bio_merge);
++
++int force_iommu __read_mostly = 0;
++
++__init int iommu_setup(char *p)
++{
++ return 1;
++}
++
++void __init pci_iommu_alloc(void)
++{
++#ifdef CONFIG_SWIOTLB
++ pci_swiotlb_init();
++#endif
++}
++
++static int __init pci_iommu_init(void)
++{
++ no_iommu_init();
++ return 0;
++}
++
++/* Must execute after PCI subsystem */
++fs_initcall(pci_iommu_init);
++#endif
++
++struct dma_coherent_mem {
++ void *virt_base;
++ u32 device_base;
++ int size;
++ int flags;
++ unsigned long *bitmap;
++};
++
++#define IOMMU_BUG_ON(test) \
++do { \
++ if (unlikely(test)) { \
++ printk(KERN_ALERT "Fatal DMA error! " \
++ "Please use 'swiotlb=force'\n"); \
++ BUG(); \
++ } \
++} while (0)
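++
++/*
++ * Without the swiotlb there is no way to bounce-buffer a DMA the device
++ * cannot address, so an unreachable address is fatal here; the message
++ * points the user at booting with swiotlb=force instead.
++ */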
++
++int
++dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
++ enum dma_data_direction direction)
++{
++ int i, rc;
++
++ if (direction == DMA_NONE)
++ BUG();
++ WARN_ON(nents == 0 || sg[0].length == 0);
++
++ if (swiotlb) {
++ rc = swiotlb_map_sg(hwdev, sg, nents, direction);
++ } else {
++ for (i = 0; i < nents; i++ ) {
++ sg[i].dma_address =
++ page_to_bus(sg[i].page) + sg[i].offset;
++ sg[i].dma_length = sg[i].length;
++ BUG_ON(!sg[i].page);
++ IOMMU_BUG_ON(address_needs_mapping(
++ hwdev, sg[i].dma_address));
++ }
++ rc = nents;
++ }
++
++ flush_write_buffers();
++ return rc;
++}
++EXPORT_SYMBOL(dma_map_sg);
++
++void
++dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
++ enum dma_data_direction direction)
++{
++ BUG_ON(direction == DMA_NONE);
++ if (swiotlb)
++ swiotlb_unmap_sg(hwdev, sg, nents, direction);
++}
++EXPORT_SYMBOL(dma_unmap_sg);
++
++#ifdef CONFIG_HIGHMEM
++dma_addr_t
++dma_map_page(struct device *dev, struct page *page, unsigned long offset,
++ size_t size, enum dma_data_direction direction)
++{
++ dma_addr_t dma_addr;
++
++ BUG_ON(direction == DMA_NONE);
++
++ if (swiotlb) {
++ dma_addr = swiotlb_map_page(
++ dev, page, offset, size, direction);
++ } else {
++ dma_addr = page_to_bus(page) + offset;
++ IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
++ }
++
++ return dma_addr;
++}
++EXPORT_SYMBOL(dma_map_page);
++
++void
++dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
++ enum dma_data_direction direction)
++{
++ BUG_ON(direction == DMA_NONE);
++ if (swiotlb)
++ swiotlb_unmap_page(dev, dma_address, size, direction);
++}
++EXPORT_SYMBOL(dma_unmap_page);
++#endif /* CONFIG_HIGHMEM */
++
++int
++dma_mapping_error(dma_addr_t dma_addr)
++{
++ if (swiotlb)
++ return swiotlb_dma_mapping_error(dma_addr);
++ return 0;
++}
++EXPORT_SYMBOL(dma_mapping_error);
++
++int
++dma_supported(struct device *dev, u64 mask)
++{
++ if (swiotlb)
++ return swiotlb_dma_supported(dev, mask);
++ /*
++ * By default we'll BUG when an infeasible DMA is requested, and
++ * request swiotlb=force (see IOMMU_BUG_ON).
++ */
++ return 1;
++}
++EXPORT_SYMBOL(dma_supported);
++
++void *dma_alloc_coherent(struct device *dev, size_t size,
++ dma_addr_t *dma_handle, gfp_t gfp)
++{
++ void *ret;
++ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
++ unsigned int order = get_order(size);
++ unsigned long vstart;
++ u64 mask;
++
++ /* ignore region specifiers */
++ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
++
++ if (mem) {
++ int page = bitmap_find_free_region(mem->bitmap, mem->size,
++ order);
++ if (page >= 0) {
++ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
++ ret = mem->virt_base + (page << PAGE_SHIFT);
++ memset(ret, 0, size);
++ return ret;
++ }
++ if (mem->flags & DMA_MEMORY_EXCLUSIVE)
++ return NULL;
++ }
++
++ if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
++ gfp |= GFP_DMA;
++
++ vstart = __get_free_pages(gfp, order);
++ ret = (void *)vstart;
++
++ if (dev != NULL && dev->coherent_dma_mask)
++ mask = dev->coherent_dma_mask;
++ else
++ mask = 0xffffffff;
++
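++ /*
++ * Pages from __get_free_pages() are contiguous only in pseudo-physical
++ * space; exchange them with Xen for a range that is machine-contiguous
++ * and reachable within the device's coherent DMA mask (fls64(mask)
++ * address bits).
++ */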
++ if (ret != NULL) {
++ if (xen_create_contiguous_region(vstart, order,
++ fls64(mask)) != 0) {
++ free_pages(vstart, order);
++ return NULL;
++ }
++ memset(ret, 0, size);
++ *dma_handle = virt_to_bus(ret);
++ }
++ return ret;
++}
++EXPORT_SYMBOL(dma_alloc_coherent);
++
++void dma_free_coherent(struct device *dev, size_t size,
++ void *vaddr, dma_addr_t dma_handle)
++{
++ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
++ int order = get_order(size);
++
++ if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
++ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
++
++ bitmap_release_region(mem->bitmap, page, order);
++ } else {
++ xen_destroy_contiguous_region((unsigned long)vaddr, order);
++ free_pages((unsigned long)vaddr, order);
++ }
++}
++EXPORT_SYMBOL(dma_free_coherent);
++
++#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
++int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
++ dma_addr_t device_addr, size_t size, int flags)
++{
++ void __iomem *mem_base;
++ int pages = size >> PAGE_SHIFT;
++ int bitmap_size = (pages + 31)/32;
++
++ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
++ goto out;
++ if (!size)
++ goto out;
++ if (dev->dma_mem)
++ goto out;
++
++ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
++
++ mem_base = ioremap(bus_addr, size);
++ if (!mem_base)
++ goto out;
++
++ dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
++ if (!dev->dma_mem)
++ goto out;
++ memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
++ dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
++ if (!dev->dma_mem->bitmap)
++ goto free1_out;
++ memset(dev->dma_mem->bitmap, 0, bitmap_size);
++
++ dev->dma_mem->virt_base = mem_base;
++ dev->dma_mem->device_base = device_addr;
++ dev->dma_mem->size = pages;
++ dev->dma_mem->flags = flags;
++
++ if (flags & DMA_MEMORY_MAP)
++ return DMA_MEMORY_MAP;
++
++ return DMA_MEMORY_IO;
++
++ free1_out:
++ kfree(dev->dma_mem->bitmap);
++ out:
++ return 0;
++}
++EXPORT_SYMBOL(dma_declare_coherent_memory);
++
++void dma_release_declared_memory(struct device *dev)
++{
++ struct dma_coherent_mem *mem = dev->dma_mem;
++
++ if (!mem)
++ return;
++ dev->dma_mem = NULL;
++ iounmap(mem->virt_base);
++ kfree(mem->bitmap);
++ kfree(mem);
++}
++EXPORT_SYMBOL(dma_release_declared_memory);
++
++void *dma_mark_declared_memory_occupied(struct device *dev,
++ dma_addr_t device_addr, size_t size)
++{
++ struct dma_coherent_mem *mem = dev->dma_mem;
++ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
++ int pos, err;
++
++ if (!mem)
++ return ERR_PTR(-EINVAL);
++
++ pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
++ err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
++ if (err != 0)
++ return ERR_PTR(err);
++ return mem->virt_base + (pos << PAGE_SHIFT);
++}
++EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
++#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
++
++dma_addr_t
++dma_map_single(struct device *dev, void *ptr, size_t size,
++ enum dma_data_direction direction)
++{
++ dma_addr_t dma;
++
++ if (direction == DMA_NONE)
++ BUG();
++ WARN_ON(size == 0);
++
++ if (swiotlb) {
++ dma = swiotlb_map_single(dev, ptr, size, direction);
++ } else {
++ dma = virt_to_bus(ptr);
++ IOMMU_BUG_ON(range_straddles_page_boundary(ptr, size));
++ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
++ }
++
++ flush_write_buffers();
++ return dma;
++}
++EXPORT_SYMBOL(dma_map_single);
++
++void
++dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
++ enum dma_data_direction direction)
++{
++ if (direction == DMA_NONE)
++ BUG();
++ if (swiotlb)
++ swiotlb_unmap_single(dev, dma_addr, size, direction);
++}
++EXPORT_SYMBOL(dma_unmap_single);
++
++void
++dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
++ enum dma_data_direction direction)
++{
++ if (swiotlb)
++ swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
++}
++EXPORT_SYMBOL(dma_sync_single_for_cpu);
++
++void
++dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
++ enum dma_data_direction direction)
++{
++ if (swiotlb)
++ swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
++}
++EXPORT_SYMBOL(dma_sync_single_for_device);
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/process-xen.c 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,853 @@
++/*
++ * linux/arch/i386/kernel/process.c
++ *
++ * Copyright (C) 1995 Linus Torvalds
++ *
++ * Pentium III FXSR, SSE support
++ * Gareth Hughes <gareth@valinux.com>, May 2000
++ */
++
++/*
++ * This file handles the architecture-dependent parts of process handling.
++ */
++
++#include <stdarg.h>
++
++#include <linux/cpu.h>
++#include <linux/errno.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/kernel.h>
++#include <linux/mm.h>
++#include <linux/elfcore.h>
++#include <linux/smp.h>
++#include <linux/smp_lock.h>
++#include <linux/stddef.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/user.h>
++#include <linux/a.out.h>
++#include <linux/interrupt.h>
++#include <linux/utsname.h>
++#include <linux/delay.h>
++#include <linux/reboot.h>
++#include <linux/init.h>
++#include <linux/mc146818rtc.h>
++#include <linux/module.h>
++#include <linux/kallsyms.h>
++#include <linux/ptrace.h>
++#include <linux/random.h>
++
++#include <asm/uaccess.h>
++#include <asm/pgtable.h>
++#include <asm/system.h>
++#include <asm/io.h>
++#include <asm/ldt.h>
++#include <asm/processor.h>
++#include <asm/i387.h>
++#include <asm/desc.h>
++#include <asm/vm86.h>
++#ifdef CONFIG_MATH_EMULATION
++#include <asm/math_emu.h>
++#endif
++
++#include <xen/interface/physdev.h>
++#include <xen/interface/vcpu.h>
++#include <xen/cpu_hotplug.h>
++
++#include <linux/err.h>
++
++#include <asm/tlbflush.h>
++#include <asm/cpu.h>
++
++asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
++
++static int hlt_counter;
++
++unsigned long boot_option_idle_override = 0;
++EXPORT_SYMBOL(boot_option_idle_override);
++
++/*
++ * Return saved PC of a blocked thread.
++ */
++unsigned long thread_saved_pc(struct task_struct *tsk)
++{
++ return ((unsigned long *)tsk->thread.esp)[3];
++}
++
++/*
++ * Power management idle function, if any.
++ */
++void (*pm_idle)(void);
++EXPORT_SYMBOL(pm_idle);
++static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
++
++void disable_hlt(void)
++{
++ hlt_counter++;
++}
++
++EXPORT_SYMBOL(disable_hlt);
++
++void enable_hlt(void)
++{
++ hlt_counter--;
++}
++
++EXPORT_SYMBOL(enable_hlt);
++
++/*
++ * On SMP it's slightly faster (but much more power-consuming!)
++ * to poll the ->work.need_resched flag instead of waiting for the
++ * cross-CPU IPI to arrive. Use this option with caution.
++ */
++static void poll_idle (void)
++{
++ local_irq_enable();
++
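++ /*
++ * Spin until TIF_NEED_RESCHED appears in the thread flags, executing
++ * rep;nop (PAUSE) in the loop body to ease pressure on the sibling
++ * hyperthread and the memory bus.
++ */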
++ asm volatile(
++ "2:"
++ "testl %0, %1;"
++ "rep; nop;"
++ "je 2b;"
++ : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
++}
++
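++/*
++ * On Xen, safe_halt() blocks the VCPU via a hypercall rather than
++ * executing HLT; TS_POLLING is cleared first so a remote wakeup is
++ * delivered as a real event rather than relying on the polling flag.
++ */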
++static void xen_idle(void)
++{
++ local_irq_disable();
++
++ if (need_resched())
++ local_irq_enable();
++ else {
++ current_thread_info()->status &= ~TS_POLLING;
++ smp_mb__after_clear_bit();
++ safe_halt();
++ current_thread_info()->status |= TS_POLLING;
++ }
++}
++#ifdef CONFIG_APM_MODULE
++EXPORT_SYMBOL(default_idle);
++#endif
++
++#ifdef CONFIG_HOTPLUG_CPU
++extern cpumask_t cpu_initialized;
++static inline void play_dead(void)
++{
++ idle_task_exit();
++ local_irq_disable();
++ cpu_clear(smp_processor_id(), cpu_initialized);
++ preempt_enable_no_resched();
++ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
++ cpu_bringup();
++}
++#else
++static inline void play_dead(void)
++{
++ BUG();
++}
++#endif /* CONFIG_HOTPLUG_CPU */
++
++/*
++ * The idle thread. There's no useful work to be
++ * done, so just try to conserve power and have a
++ * low exit latency (ie sit in a loop waiting for
++ * somebody to say that they'd like to reschedule)
++ */
++void cpu_idle(void)
++{
++ int cpu = smp_processor_id();
++
++ current_thread_info()->status |= TS_POLLING;
++
++ /* endless idle loop with no priority at all */
++ while (1) {
++ while (!need_resched()) {
++ void (*idle)(void);
++
++ if (__get_cpu_var(cpu_idle_state))
++ __get_cpu_var(cpu_idle_state) = 0;
++
++ rmb();
++ idle = xen_idle; /* no alternatives */
++
++ if (cpu_is_offline(cpu))
++ play_dead();
++
++ __get_cpu_var(irq_stat).idle_timestamp = jiffies;
++ idle();
++ }
++ preempt_enable_no_resched();
++ schedule();
++ preempt_disable();
++ }
++}
++
++void cpu_idle_wait(void)
++{
++ unsigned int cpu, this_cpu = get_cpu();
++ cpumask_t map;
++
++ set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
++ put_cpu();
++
++ cpus_clear(map);
++ for_each_online_cpu(cpu) {
++ per_cpu(cpu_idle_state, cpu) = 1;
++ cpu_set(cpu, map);
++ }
++
++ __get_cpu_var(cpu_idle_state) = 0;
++
++ wmb();
++ do {
++ ssleep(1);
++ for_each_online_cpu(cpu) {
++ if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
++ cpu_clear(cpu, map);
++ }
++ cpus_and(map, map, cpu_online_map);
++ } while (!cpus_empty(map));
++}
++EXPORT_SYMBOL_GPL(cpu_idle_wait);
++
++void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
++{
++}
++
++static int __init idle_setup (char *str)
++{
++ if (!strncmp(str, "poll", 4)) {
++ printk("using polling idle threads.\n");
++ pm_idle = poll_idle;
++ }
++
++ boot_option_idle_override = 1;
++ return 1;
++}
++
++__setup("idle=", idle_setup);
++
++void show_regs(struct pt_regs * regs)
++{
++ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
++
++ printk("\n");
++ printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
++ printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
++ print_symbol("EIP is at %s\n", regs->eip);
++
++ if (user_mode_vm(regs))
++ printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
++ printk(" EFLAGS: %08lx %s (%s %.*s)\n",
++ regs->eflags, print_tainted(), system_utsname.release,
++ (int)strcspn(system_utsname.version, " "),
++ system_utsname.version);
++ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
++ regs->eax,regs->ebx,regs->ecx,regs->edx);
++ printk("ESI: %08lx EDI: %08lx EBP: %08lx",
++ regs->esi, regs->edi, regs->ebp);
++ printk(" DS: %04x ES: %04x\n",
++ 0xffff & regs->xds,0xffff & regs->xes);
++
++ cr0 = read_cr0();
++ cr2 = read_cr2();
++ cr3 = read_cr3();
++ cr4 = read_cr4_safe();
++ printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
++ show_trace(NULL, regs, &regs->esp);
++}
++
++/*
++ * This gets run with %ebx containing the
++ * function to call, and %edx containing
++ * the "args".
++ */
++extern void kernel_thread_helper(void);
++__asm__(".section .text\n"
++ ".align 4\n"
++ "kernel_thread_helper:\n\t"
++ "movl %edx,%eax\n\t"
++ "pushl %edx\n\t"
++ "call *%ebx\n\t"
++ "pushl %eax\n\t"
++ "call do_exit\n"
++ ".previous");
++
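++/*
++ * The helper copies the argument from %edx to %eax, pushes it, calls
++ * the thread function through %ebx, then pushes that function's return
++ * value and calls do_exit() so the kernel thread terminates cleanly.
++ */
++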
++/*
++ * Create a kernel thread
++ */
++int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
++{
++ struct pt_regs regs;
++
++ memset(&regs, 0, sizeof(regs));
++
++ regs.ebx = (unsigned long) fn;
++ regs.edx = (unsigned long) arg;
++
++ regs.xds = __USER_DS;
++ regs.xes = __USER_DS;
++ regs.orig_eax = -1;
++ regs.eip = (unsigned long) kernel_thread_helper;
++ regs.xcs = GET_KERNEL_CS();
++ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
++
++ /* Ok, create the new process.. */
++ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
++}
++EXPORT_SYMBOL(kernel_thread);
++
++/*
++ * Free current thread data structures etc..
++ */
++void exit_thread(void)
++{
++ /* The process may have allocated an io port bitmap... nuke it. */
++ if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
++ struct task_struct *tsk = current;
++ struct thread_struct *t = &tsk->thread;
++ struct physdev_set_iobitmap set_iobitmap;
++ memset(&set_iobitmap, 0, sizeof(set_iobitmap));
++ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
++ kfree(t->io_bitmap_ptr);
++ t->io_bitmap_ptr = NULL;
++ clear_thread_flag(TIF_IO_BITMAP);
++ }
++}
++
++void flush_thread(void)
++{
++ struct task_struct *tsk = current;
++
++ memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
++ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
++ clear_tsk_thread_flag(tsk, TIF_DEBUG);
++ /*
++ * Forget coprocessor state..
++ */
++ clear_fpu(tsk);
++ clear_used_math();
++}
++
++void release_thread(struct task_struct *dead_task)
++{
++ BUG_ON(dead_task->mm);
++ release_vm86_irqs(dead_task);
++}
++
++/*
++ * This gets called before we allocate a new thread and copy
++ * the current task into it.
++ */
++void prepare_to_copy(struct task_struct *tsk)
++{
++ unlazy_fpu(tsk);
++}
++
++int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
++ unsigned long unused,
++ struct task_struct * p, struct pt_regs * regs)
++{
++ struct pt_regs * childregs;
++ struct task_struct *tsk;
++ int err;
++
++ childregs = task_pt_regs(p);
++ *childregs = *regs;
++ childregs->eax = 0;
++ childregs->esp = esp;
++
++ p->thread.esp = (unsigned long) childregs;
++ p->thread.esp0 = (unsigned long) (childregs+1);
++
++ p->thread.eip = (unsigned long) ret_from_fork;
++
++ savesegment(fs,p->thread.fs);
++ savesegment(gs,p->thread.gs);
++
++ tsk = current;
++ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
++ p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
++ if (!p->thread.io_bitmap_ptr) {
++ p->thread.io_bitmap_max = 0;
++ return -ENOMEM;
++ }
++ memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
++ IO_BITMAP_BYTES);
++ set_tsk_thread_flag(p, TIF_IO_BITMAP);
++ }
++
++ /*
++ * Set a new TLS for the child thread?
++ */
++ if (clone_flags & CLONE_SETTLS) {
++ struct desc_struct *desc;
++ struct user_desc info;
++ int idx;
++
++ err = -EFAULT;
++ if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
++ goto out;
++ err = -EINVAL;
++ if (LDT_empty(&info))
++ goto out;
++
++ idx = info.entry_number;
++ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
++ goto out;
++
++ desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
++ desc->a = LDT_entry_a(&info);
++ desc->b = LDT_entry_b(&info);
++ }
++
++ p->thread.iopl = current->thread.iopl;
++
++ err = 0;
++ out:
++ if (err && p->thread.io_bitmap_ptr) {
++ kfree(p->thread.io_bitmap_ptr);
++ p->thread.io_bitmap_max = 0;
++ }
++ return err;
++}
++
++/*
++ * fill in the user structure for a core dump..
++ */
++void dump_thread(struct pt_regs * regs, struct user * dump)
++{
++ int i;
++
++/* changed the size calculations - should hopefully work better. lbt */
++ dump->magic = CMAGIC;
++ dump->start_code = 0;
++ dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
++ dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
++ dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
++ dump->u_dsize -= dump->u_tsize;
++ dump->u_ssize = 0;
++ for (i = 0; i < 8; i++)
++ dump->u_debugreg[i] = current->thread.debugreg[i];
++
++ if (dump->start_stack < TASK_SIZE)
++ dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
++
++ dump->regs.ebx = regs->ebx;
++ dump->regs.ecx = regs->ecx;
++ dump->regs.edx = regs->edx;
++ dump->regs.esi = regs->esi;
++ dump->regs.edi = regs->edi;
++ dump->regs.ebp = regs->ebp;
++ dump->regs.eax = regs->eax;
++ dump->regs.ds = regs->xds;
++ dump->regs.es = regs->xes;
++ savesegment(fs,dump->regs.fs);
++ savesegment(gs,dump->regs.gs);
++ dump->regs.orig_eax = regs->orig_eax;
++ dump->regs.eip = regs->eip;
++ dump->regs.cs = regs->xcs;
++ dump->regs.eflags = regs->eflags;
++ dump->regs.esp = regs->esp;
++ dump->regs.ss = regs->xss;
++
++ dump->u_fpvalid = dump_fpu (regs, &dump->i387);
++}
++EXPORT_SYMBOL(dump_thread);
++
++/*
++ * Capture the user space registers if the task is not running (in user space)
++ */
++int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
++{
++ struct pt_regs ptregs = *task_pt_regs(tsk);
++ ptregs.xcs &= 0xffff;
++ ptregs.xds &= 0xffff;
++ ptregs.xes &= 0xffff;
++ ptregs.xss &= 0xffff;
++
++ elf_core_copy_regs(regs, &ptregs);
++
++ return 1;
++}
++
++static noinline void __switch_to_xtra(struct task_struct *next_p)
++{
++ struct thread_struct *next;
++
++ next = &next_p->thread;
++
++ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
++ set_debugreg(next->debugreg[0], 0);
++ set_debugreg(next->debugreg[1], 1);
++ set_debugreg(next->debugreg[2], 2);
++ set_debugreg(next->debugreg[3], 3);
++ /* no 4 and 5 */
++ set_debugreg(next->debugreg[6], 6);
++ set_debugreg(next->debugreg[7], 7);
++ }
++}
++
++/*
++ * This function selects if the context switch from prev to next
++ * has to tweak the TSC disable bit in the cr4.
++ */
++static inline void disable_tsc(struct task_struct *prev_p,
++ struct task_struct *next_p)
++{
++ struct thread_info *prev, *next;
++
++ /*
++ * gcc should eliminate the ->thread_info dereference if
++ * has_secure_computing returns 0 at compile time (SECCOMP=n).
++ */
++ prev = task_thread_info(prev_p);
++ next = task_thread_info(next_p);
++
++ if (has_secure_computing(prev) || has_secure_computing(next)) {
++ /* slow path here */
++ if (has_secure_computing(prev) &&
++ !has_secure_computing(next)) {
++ write_cr4(read_cr4() & ~X86_CR4_TSD);
++ } else if (!has_secure_computing(prev) &&
++ has_secure_computing(next))
++ write_cr4(read_cr4() | X86_CR4_TSD);
++ }
++}
++
++/*
++ * switch_to(x,y) should switch tasks from x to y.
++ *
++ * We fsave/fwait so that an exception goes off at the right time
++ * (as a call from the fsave or fwait in effect) rather than to
++ * the wrong process. Lazy FP saving no longer makes any sense
++ * with modern CPU's, and this simplifies a lot of things (SMP
++ * and UP become the same).
++ *
++ * NOTE! We used to use the x86 hardware context switching. The
++ * reason for not using it any more becomes apparent when you
++ * try to recover gracefully from saved state that is no longer
++ * valid (stale segment register values in particular). With the
++ * hardware task-switch, there is no way to fix up bad state in
++ * a reasonable manner.
++ *
++ * The fact that Intel documents the hardware task-switching to
++ * be slow is a fairly red herring - this code is not noticeably
++ * faster. However, there _is_ some room for improvement here,
++ * so the performance issues may eventually be a valid point.
++ * More important, however, is the fact that this allows us much
++ * more flexibility.
++ *
++ * The return value (in %eax) will be the "prev" task after
++ * the task-switch, and shows up in ret_from_fork in entry.S,
++ * for example.
++ */
++struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
++{
++ struct thread_struct *prev = &prev_p->thread,
++ *next = &next_p->thread;
++ int cpu = smp_processor_id();
++#ifndef CONFIG_X86_NO_TSS
++ struct tss_struct *tss = &per_cpu(init_tss, cpu);
++#endif
++ struct physdev_set_iopl iopl_op;
++ struct physdev_set_iobitmap iobmp_op;
++ multicall_entry_t _mcl[8], *mcl = _mcl;
++
++ /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
++
++ /*
++ * This is basically '__unlazy_fpu', except that we queue a
++ * multicall to indicate FPU task switch, rather than
++ * synchronously trapping to Xen.
++ */
++ if (prev_p->thread_info->status & TS_USEDFPU) {
++ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
++ mcl->op = __HYPERVISOR_fpu_taskswitch;
++ mcl->args[0] = 1;
++ mcl++;
++ }
++#if 0 /* lazy fpu sanity check */
++ else BUG_ON(!(read_cr0() & 8));
++#endif
++
++ /*
++ * Reload esp0.
++ * This is load_esp0(tss, next) with a multicall.
++ */
++ mcl->op = __HYPERVISOR_stack_switch;
++ mcl->args[0] = __KERNEL_DS;
++ mcl->args[1] = next->esp0;
++ mcl++;
++
++ /*
++ * Load the per-thread Thread-Local Storage descriptor.
++ * This is load_TLS(next, cpu) with multicalls.
++ */
++#define C(i) do { \
++ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
++ next->tls_array[i].b != prev->tls_array[i].b)) { \
++ mcl->op = __HYPERVISOR_update_descriptor; \
++ *(u64 *)&mcl->args[0] = virt_to_machine( \
++ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
++ *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \
++ mcl++; \
++ } \
++} while (0)
++ C(0); C(1); C(2);
++#undef C
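++ /*
++ * Each C(i) above queues an update_descriptor multicall only for TLS
++ * slots that actually changed; together with the FPU and stack-switch
++ * entries they are submitted in a single HYPERVISOR_multicall below,
++ * saving one hypercall per operation.
++ */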
++
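++ /*
++ * thread.iopl caches the EFLAGS-format IOPL field (bits 13-12);
++ * translate it to the hypervisor's 0-3 value, defaulting to 1 (the
++ * ring the kernel itself runs in under Xen) when unset.
++ */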
++ if (unlikely(prev->iopl != next->iopl)) {
++ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
++ mcl->op = __HYPERVISOR_physdev_op;
++ mcl->args[0] = PHYSDEVOP_set_iopl;
++ mcl->args[1] = (unsigned long)&iopl_op;
++ mcl++;
++ }
++
++ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
++ set_xen_guest_handle(iobmp_op.bitmap,
++ (char *)next->io_bitmap_ptr);
++ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
++ mcl->op = __HYPERVISOR_physdev_op;
++ mcl->args[0] = PHYSDEVOP_set_iobitmap;
++ mcl->args[1] = (unsigned long)&iobmp_op;
++ mcl++;
++ }
++
++ (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
++
++ /*
++ * Restore %fs and %gs if needed.
++ *
++ * Glibc normally makes %fs be zero, and %gs is one of
++ * the TLS segments.
++ */
++ if (unlikely(next->fs))
++ loadsegment(fs, next->fs);
++
++ if (next->gs)
++ loadsegment(gs, next->gs);
++
++ /*
++ * Now maybe handle debug registers
++ */
++ if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
++ __switch_to_xtra(next_p);
++
++ disable_tsc(prev_p, next_p);
++
++ return prev_p;
++}
++
++asmlinkage int sys_fork(struct pt_regs regs)
++{
++ return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
++}
++
++asmlinkage int sys_clone(struct pt_regs regs)
++{
++ unsigned long clone_flags;
++ unsigned long newsp;
++ int __user *parent_tidptr, *child_tidptr;
++
++ clone_flags = regs.ebx;
++ newsp = regs.ecx;
++ parent_tidptr = (int __user *)regs.edx;
++ child_tidptr = (int __user *)regs.edi;
++ if (!newsp)
++ newsp = regs.esp;
++ return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
++}
++
++/*
++ * This is trivial, and on the face of it looks like it
++ * could equally well be done in user mode.
++ *
++ * Not so, for quite unobvious reasons - register pressure.
++ * In user mode vfork() cannot have a stack frame, and if
++ * done by calling the "clone()" system call directly, you
++ * do not have enough call-clobbered registers to hold all
++ * the information you need.
++ */
++asmlinkage int sys_vfork(struct pt_regs regs)
++{
++ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
++}
++
++/*
++ * sys_execve() executes a new program.
++ */
++asmlinkage int sys_execve(struct pt_regs regs)
++{
++ int error;
++ char * filename;
++
++ filename = getname((char __user *) regs.ebx);
++ error = PTR_ERR(filename);
++ if (IS_ERR(filename))
++ goto out;
++ error = do_execve(filename,
++ (char __user * __user *) regs.ecx,
++ (char __user * __user *) regs.edx,
++ &regs);
++ if (error == 0) {
++ task_lock(current);
++ current->ptrace &= ~PT_DTRACE;
++ task_unlock(current);
++ /* Make sure we don't return using sysenter.. */
++ set_thread_flag(TIF_IRET);
++ }
++ putname(filename);
++out:
++ return error;
++}
++
++#define top_esp (THREAD_SIZE - sizeof(unsigned long))
++#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
++
++unsigned long get_wchan(struct task_struct *p)
++{
++ unsigned long ebp, esp, eip;
++ unsigned long stack_page;
++ int count = 0;
++ if (!p || p == current || p->state == TASK_RUNNING)
++ return 0;
++ stack_page = (unsigned long)task_stack_page(p);
++ esp = p->thread.esp;
++ if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
++ return 0;
++ /* include/asm-i386/system.h:switch_to() pushes ebp last. */
++ ebp = *(unsigned long *) esp;
++ do {
++ if (ebp < stack_page || ebp > top_ebp+stack_page)
++ return 0;
++ eip = *(unsigned long *) (ebp+4);
++ if (!in_sched_functions(eip))
++ return eip;
++ ebp = *(unsigned long *) ebp;
++ } while (count++ < 16);
++ return 0;
++}
++
++/*
++ * sys_alloc_thread_area: get a yet unused TLS descriptor index.
++ */
++static int get_free_idx(void)
++{
++ struct thread_struct *t = &current->thread;
++ int idx;
++
++ for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
++ if (desc_empty(t->tls_array + idx))
++ return idx + GDT_ENTRY_TLS_MIN;
++ return -ESRCH;
++}
++
++/*
++ * Set a given TLS descriptor:
++ */
++asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
++{
++ struct thread_struct *t = &current->thread;
++ struct user_desc info;
++ struct desc_struct *desc;
++ int cpu, idx;
++
++ if (copy_from_user(&info, u_info, sizeof(info)))
++ return -EFAULT;
++ idx = info.entry_number;
++
++ /*
++ * index -1 means the kernel should try to find and
++ * allocate an empty descriptor:
++ */
++ if (idx == -1) {
++ idx = get_free_idx();
++ if (idx < 0)
++ return idx;
++ if (put_user(idx, &u_info->entry_number))
++ return -EFAULT;
++ }
++
++ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
++ return -EINVAL;
++
++ desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
++
++ /*
++ * We must not get preempted while modifying the TLS.
++ */
++ cpu = get_cpu();
++
++ if (LDT_empty(&info)) {
++ desc->a = 0;
++ desc->b = 0;
++ } else {
++ desc->a = LDT_entry_a(&info);
++ desc->b = LDT_entry_b(&info);
++ }
++ load_TLS(t, cpu);
++
++ put_cpu();
++
++ return 0;
++}
++
++/*
++ * Get the current Thread-Local Storage area:
++ */
++
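++/*
++ * An i386 segment descriptor scatters the 32-bit base and 20-bit limit
++ * across both words of the descriptor; these helpers reassemble them.
++ */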
++#define GET_BASE(desc) ( \
++ (((desc)->a >> 16) & 0x0000ffff) | \
++ (((desc)->b << 16) & 0x00ff0000) | \
++ ( (desc)->b & 0xff000000) )
++
++#define GET_LIMIT(desc) ( \
++ ((desc)->a & 0x0ffff) | \
++ ((desc)->b & 0xf0000) )
++
++#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
++#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
++#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
++#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
++#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
++#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
++
++asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
++{
++ struct user_desc info;
++ struct desc_struct *desc;
++ int idx;
++
++ if (get_user(idx, &u_info->entry_number))
++ return -EFAULT;
++ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
++ return -EINVAL;
++
++ memset(&info, 0, sizeof(info));
++
++ desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
++
++ info.entry_number = idx;
++ info.base_addr = GET_BASE(desc);
++ info.limit = GET_LIMIT(desc);
++ info.seg_32bit = GET_32BIT(desc);
++ info.contents = GET_CONTENTS(desc);
++ info.read_exec_only = !GET_WRITABLE(desc);
++ info.limit_in_pages = GET_LIMIT_PAGES(desc);
++ info.seg_not_present = !GET_PRESENT(desc);
++ info.useable = GET_USEABLE(desc);
++
++ if (copy_to_user(u_info, &info, sizeof(info)))
++ return -EFAULT;
++ return 0;
++}
++
++unsigned long arch_align_stack(unsigned long sp)
++{
++ if (randomize_va_space)
++ sp -= get_random_int() % 8192;
++ return sp & ~0xf;
++}
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/quirks-xen.c 2007-08-27 14:02:01.000000000 -0400
+@@ -0,0 +1,47 @@
++/*
++ * This file contains work-arounds for x86 and x86_64 platform bugs.
++ */
++#include <linux/pci.h>
++#include <linux/irq.h>
++
++#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
++
++static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
++{
++ u8 config, rev;
++ u32 word;
++
++ /* BIOS may enable hardware IRQ balancing for
++ * E7520/E7320/E7525(revision ID 0x9 and below)
++ * based platforms.
++ * Disable SW irqbalance/affinity on those platforms.
++ */
++ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
++ if (rev > 0x9)
++ return;
++
++ printk(KERN_INFO "Intel E7520/7320/7525 detected.");
++
++ /* enable access to config space*/
++ pci_read_config_byte(dev, 0xf4, &config);
++ pci_write_config_byte(dev, 0xf4, config|0x2);
++
++ /* read xTPR register */
++ raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
++
++ if (!(word & (1 << 13))) {
++ struct xen_platform_op op;
++ printk(KERN_INFO "Disabling irq balancing and affinity\n");
++ op.cmd = XENPF_platform_quirk;
++ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
++ (void)HYPERVISOR_platform_op(&op);
++ }
++
++ /* put back the original value for config space*/
++ if (!(config & 0x2))
++ pci_write_config_byte(dev, 0xf4, config);
++}
++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
++#endif
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/setup-xen.c 2007-08-27 14:02:09.000000000 -0400
+@@ -0,0 +1,1871 @@
++/*
++ * linux/arch/i386/kernel/setup.c
++ *
++ * Copyright (C) 1995 Linus Torvalds
++ *
++ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
++ *
++ * Memory region support
++ * David Parsons <orc@pell.chi.il.us>, July-August 1999
++ *
++ * Added E820 sanitization routine (removes overlapping memory regions);
++ * Brian Moyle <bmoyle@mvista.com>, February 2001
++ *
++ * Moved CPU detection code to cpu/${cpu}.c
++ * Patrick Mochel <mochel@osdl.org>, March 2002
++ *
++ * Provisions for empty E820 memory regions (reported by certain BIOSes).
++ * Alex Achenbach <xela@slit.de>, December 2002.
++ *
++ */
++
++/*
++ * This file handles the architecture-dependent parts of initialization
++ */
++
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/mmzone.h>
++#include <linux/screen_info.h>
++#include <linux/ioport.h>
++#include <linux/acpi.h>
++#include <linux/apm_bios.h>
++#include <linux/initrd.h>
++#include <linux/bootmem.h>
++#include <linux/seq_file.h>
++#include <linux/platform_device.h>
++#include <linux/console.h>
++#include <linux/mca.h>
++#include <linux/root_dev.h>
++#include <linux/highmem.h>
++#include <linux/module.h>
++#include <linux/efi.h>
++#include <linux/init.h>
++#include <linux/edd.h>
++#include <linux/nodemask.h>
++#include <linux/kernel.h>
++#include <linux/percpu.h>
++#include <linux/notifier.h>
++#include <linux/kexec.h>
++#include <linux/crash_dump.h>
++#include <linux/dmi.h>
++#include <linux/pfn.h>
++
++#include <video/edid.h>
++
++#include <asm/apic.h>
++#include <asm/e820.h>
++#include <asm/mpspec.h>
++#include <asm/setup.h>
++#include <asm/arch_hooks.h>
++#include <asm/sections.h>
++#include <asm/io_apic.h>
++#include <asm/ist.h>
++#include <asm/io.h>
++#include <asm/hypervisor.h>
++#include <xen/interface/physdev.h>
++#include <xen/interface/memory.h>
++#include <xen/features.h>
++#include <xen/xencons.h>
++#include <setup_arch.h>
++#include <bios_ebda.h>
++
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
++/* Forward Declaration. */
++void __init find_max_pfn(void);
++
++static int xen_panic_event(struct notifier_block *, unsigned long, void *);
++static struct notifier_block xen_panic_block = {
++ xen_panic_event, NULL, 0 /* try to go last */
++};
++
++extern char hypercall_page[PAGE_SIZE];
++EXPORT_SYMBOL(hypercall_page);
++
++int disable_pse __devinitdata = 0;
++
++/*
++ * Machine setup..
++ */
++
++#ifdef CONFIG_EFI
++int efi_enabled = 0;
++EXPORT_SYMBOL(efi_enabled);
++#endif
++
++/* cpu data as detected by the assembly code in head.S */
++struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
++/* common cpu data for all cpus */
++struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
++EXPORT_SYMBOL(boot_cpu_data);
++
++unsigned long mmu_cr4_features;
++
++#ifdef CONFIG_ACPI
++ int acpi_disabled = 0;
++#else
++ int acpi_disabled = 1;
++#endif
++EXPORT_SYMBOL(acpi_disabled);
++
++#ifdef CONFIG_ACPI
++int __initdata acpi_force = 0;
++extern acpi_interrupt_flags acpi_sci_flags;
++#endif
++
++/* for MCA, but anyone else can use it if they want */
++unsigned int machine_id;
++#ifdef CONFIG_MCA
++EXPORT_SYMBOL(machine_id);
++#endif
++unsigned int machine_submodel_id;
++unsigned int BIOS_revision;
++unsigned int mca_pentium_flag;
++
++/* For PCI or other memory-mapped resources */
++unsigned long pci_mem_start = 0x10000000;
++#ifdef CONFIG_PCI
++EXPORT_SYMBOL(pci_mem_start);
++#endif
++
++/* Boot loader ID as an integer, for the benefit of proc_dointvec */
++int bootloader_type;
++
++/* user-defined highmem size */
++static unsigned int highmem_pages = -1;
++
++/*
++ * Setup options
++ */
++struct drive_info_struct { char dummy[32]; } drive_info;
++#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
++ defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
++EXPORT_SYMBOL(drive_info);
++#endif
++struct screen_info screen_info;
++EXPORT_SYMBOL(screen_info);
++struct apm_info apm_info;
++EXPORT_SYMBOL(apm_info);
++struct sys_desc_table_struct {
++ unsigned short length;
++ unsigned char table[0];
++};
++struct edid_info edid_info;
++EXPORT_SYMBOL_GPL(edid_info);
++struct ist_info ist_info;
++#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
++ defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
++EXPORT_SYMBOL(ist_info);
++#endif
++struct e820map e820;
++#ifdef CONFIG_XEN
++struct e820map machine_e820;
++#endif
++
++extern void early_cpu_init(void);
++extern void generic_apic_probe(char *);
++extern int root_mountflags;
++
++unsigned long saved_videomode;
++
++#define RAMDISK_IMAGE_START_MASK 0x07FF
++#define RAMDISK_PROMPT_FLAG 0x8000
++#define RAMDISK_LOAD_FLAG 0x4000
++
++static char command_line[COMMAND_LINE_SIZE];
++
++unsigned char __initdata boot_params[PARAM_SIZE];
++
++static struct resource data_resource = {
++ .name = "Kernel data",
++ .start = 0,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++static struct resource code_resource = {
++ .name = "Kernel code",
++ .start = 0,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++static struct resource system_rom_resource = {
++ .name = "System ROM",
++ .start = 0xf0000,
++ .end = 0xfffff,
++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
++};
++
++static struct resource extension_rom_resource = {
++ .name = "Extension ROM",
++ .start = 0xe0000,
++ .end = 0xeffff,
++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
++};
++
++static struct resource adapter_rom_resources[] = { {
++ .name = "Adapter ROM",
++ .start = 0xc8000,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
++}, {
++ .name = "Adapter ROM",
++ .start = 0,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
++}, {
++ .name = "Adapter ROM",
++ .start = 0,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
++}, {
++ .name = "Adapter ROM",
++ .start = 0,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
++}, {
++ .name = "Adapter ROM",
++ .start = 0,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
++}, {
++ .name = "Adapter ROM",
++ .start = 0,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
++} };
++
++#define ADAPTER_ROM_RESOURCES \
++ (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
++
++static struct resource video_rom_resource = {
++ .name = "Video ROM",
++ .start = 0xc0000,
++ .end = 0xc7fff,
++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
++};
++
++static struct resource video_ram_resource = {
++ .name = "Video RAM area",
++ .start = 0xa0000,
++ .end = 0xbffff,
++ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++static struct resource standard_io_resources[] = { {
++ .name = "dma1",
++ .start = 0x0000,
++ .end = 0x001f,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "pic1",
++ .start = 0x0020,
++ .end = 0x0021,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "timer0",
++ .start = 0x0040,
++ .end = 0x0043,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "timer1",
++ .start = 0x0050,
++ .end = 0x0053,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "keyboard",
++ .start = 0x0060,
++ .end = 0x006f,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "dma page reg",
++ .start = 0x0080,
++ .end = 0x008f,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "pic2",
++ .start = 0x00a0,
++ .end = 0x00a1,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "dma2",
++ .start = 0x00c0,
++ .end = 0x00df,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "fpu",
++ .start = 0x00f0,
++ .end = 0x00ff,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++} };
++
++#define STANDARD_IO_RESOURCES \
++ (sizeof standard_io_resources / sizeof standard_io_resources[0])
++
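++/*
++ * An option ROM begins with the 0xaa55 signature; byte 2 gives its
++ * length in 512-byte units (see probe_roms() below).
++ */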
++#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
++
++static int __init romchecksum(unsigned char *rom, unsigned long length)
++{
++ unsigned char *p, sum = 0;
++
++ for (p = rom; p < rom + length; p++)
++ sum += *p;
++ return sum == 0;
++}
++
++static void __init probe_roms(void)
++{
++ unsigned long start, length, upper;
++ unsigned char *rom;
++ int i;
++
++#ifdef CONFIG_XEN
++ /* Nothing to do if not running in dom0. */
++ if (!is_initial_xendomain())
++ return;
++#endif
++
++ /* video rom */
++ upper = adapter_rom_resources[0].start;
++ for (start = video_rom_resource.start; start < upper; start += 2048) {
++ rom = isa_bus_to_virt(start);
++ if (!romsignature(rom))
++ continue;
++
++ video_rom_resource.start = start;
++
++ /* 0 < length <= 0x7f * 512, historically */
++ length = rom[2] * 512;
++
++ /* if checksum okay, trust length byte */
++ if (length && romchecksum(rom, length))
++ video_rom_resource.end = start + length - 1;
++
++ request_resource(&iomem_resource, &video_rom_resource);
++ break;
++ }
++
++ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
++ if (start < upper)
++ start = upper;
++
++ /* system rom */
++ request_resource(&iomem_resource, &system_rom_resource);
++ upper = system_rom_resource.start;
++
++ /* check for extension rom (ignore length byte!) */
++ rom = isa_bus_to_virt(extension_rom_resource.start);
++ if (romsignature(rom)) {
++ length = extension_rom_resource.end - extension_rom_resource.start + 1;
++ if (romchecksum(rom, length)) {
++ request_resource(&iomem_resource, &extension_rom_resource);
++ upper = extension_rom_resource.start;
++ }
++ }
++
++ /* check for adapter roms on 2k boundaries */
++ for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
++ rom = isa_bus_to_virt(start);
++ if (!romsignature(rom))
++ continue;
++
++ /* 0 < length <= 0x7f * 512, historically */
++ length = rom[2] * 512;
++
++ /* but accept any length that fits if checksum okay */
++ if (!length || start + length > upper || !romchecksum(rom, length))
++ continue;
++
++ adapter_rom_resources[i].start = start;
++ adapter_rom_resources[i].end = start + length - 1;
++ request_resource(&iomem_resource, &adapter_rom_resources[i]);
++
++ start = adapter_rom_resources[i++].end & ~2047UL;
++ }
++}
++
++/*
++ * Point at the empty zero page to start with. We map the real shared_info
++ * page as soon as fixmap is up and running.
++ */
++shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
++EXPORT_SYMBOL(HYPERVISOR_shared_info);
++
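++/*
++ * The p2m table maps guest pseudo-physical frame numbers to machine
++ * frames; the pfn_to_mfn frame lists record which frames hold that
++ * table so the tools can find it across save/restore.
++ */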
++unsigned long *phys_to_machine_mapping;
++unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
++EXPORT_SYMBOL(phys_to_machine_mapping);
++
++/* Raw start-of-day parameters from the hypervisor. */
++start_info_t *xen_start_info;
++EXPORT_SYMBOL(xen_start_info);
++
++void __init add_memory_region(unsigned long long start,
++ unsigned long long size, int type)
++{
++ int x;
++
++ if (!efi_enabled) {
++ x = e820.nr_map;
++
++ if (x == E820MAX) {
++ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
++ return;
++ }
++
++ e820.map[x].addr = start;
++ e820.map[x].size = size;
++ e820.map[x].type = type;
++ e820.nr_map++;
++ }
++} /* add_memory_region */
++
++static void __init limit_regions(unsigned long long size)
++{
++ unsigned long long current_addr = 0;
++ int i;
++
++ if (efi_enabled) {
++ efi_memory_desc_t *md;
++ void *p;
++
++ for (p = memmap.map, i = 0; p < memmap.map_end;
++ p += memmap.desc_size, i++) {
++ md = p;
++ current_addr = md->phys_addr + (md->num_pages << 12);
++ if (md->type == EFI_CONVENTIONAL_MEMORY) {
++ if (current_addr >= size) {
++ md->num_pages -=
++ (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
++ memmap.nr_map = i + 1;
++ return;
++ }
++ }
++ }
++ }
++ for (i = 0; i < e820.nr_map; i++) {
++ current_addr = e820.map[i].addr + e820.map[i].size;
++ if (current_addr < size)
++ continue;
++
++ if (e820.map[i].type != E820_RAM)
++ continue;
++
++ if (e820.map[i].addr >= size) {
++ /*
++ * This region starts past the end of the
++ * requested size, skip it completely.
++ */
++ e820.nr_map = i;
++ } else {
++ e820.nr_map = i + 1;
++ e820.map[i].size -= current_addr - size;
++ }
++ return;
++ }
++#ifdef CONFIG_XEN
++ if (i==e820.nr_map && current_addr < size) {
++ /*
++ * The e820 map finished before our requested size so
++ * extend the final entry to the requested address.
++ */
++ --i;
++ if (e820.map[i].type == E820_RAM)
++ e820.map[i].size -= current_addr - size;
++ else
++ add_memory_region(current_addr, size - current_addr, E820_RAM);
++ }
++#endif
++}
++
++#define E820_DEBUG 1
++
++static void __init print_memory_map(char *who)
++{
++ int i;
++
++ for (i = 0; i < e820.nr_map; i++) {
++ printk(" %s: %016Lx - %016Lx ", who,
++ e820.map[i].addr,
++ e820.map[i].addr + e820.map[i].size);
++ switch (e820.map[i].type) {
++ case E820_RAM: printk("(usable)\n");
++ break;
++ case E820_RESERVED:
++ printk("(reserved)\n");
++ break;
++ case E820_ACPI:
++ printk("(ACPI data)\n");
++ break;
++ case E820_NVS:
++ printk("(ACPI NVS)\n");
++ break;
++ default: printk("type %lu\n", e820.map[i].type);
++ break;
++ }
++ }
++}
++
++/*
++ * Sanitize the BIOS e820 map.
++ *
++ * Some e820 responses include overlapping entries. The following
++ * replaces the original e820 map with a new one, removing overlaps.
++ *
++ */
++struct change_member {
++ struct e820entry *pbios; /* pointer to original bios entry */
++ unsigned long long addr; /* address for this change point */
++};
++static struct change_member change_point_list[2*E820MAX] __initdata;
++static struct change_member *change_point[2*E820MAX] __initdata;
++static struct e820entry *overlap_list[E820MAX] __initdata;
++static struct e820entry new_bios[E820MAX] __initdata;
++
++int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
++{
++ struct change_member *change_tmp;
++ unsigned long current_type, last_type;
++ unsigned long long last_addr;
++ int chgidx, still_changing;
++ int overlap_entries;
++ int new_bios_entry;
++ int old_nr, new_nr, chg_nr;
++ int i;
++
++ /*
++ Visually we're performing the following (1,2,3,4 = memory types)...
++
++ Sample memory map (w/overlaps):
++ ____22__________________
++ ______________________4_
++ ____1111________________
++ _44_____________________
++ 11111111________________
++ ____________________33__
++ ___________44___________
++ __________33333_________
++ ______________22________
++ ___________________2222_
++ _________111111111______
++ _____________________11_
++ _________________4______
++
++ Sanitized equivalent (no overlap):
++ 1_______________________
++ _44_____________________
++ ___1____________________
++ ____22__________________
++ ______11________________
++ _________1______________
++ __________3_____________
++ ___________44___________
++ _____________33_________
++ _______________2________
++ ________________1_______
++ _________________4______
++ ___________________2____
++ ____________________33__
++ ______________________4_
++ */
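++	/*
++	 * Mechanically: record a change-point at every region start and
++	 * end, sort them by address, then sweep the sorted list tracking
++	 * which regions overlap each point; the highest type value
++	 * present (usable RAM loses to anything reserved) wins there.
++	 */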
++
++ /* if there's only one memory region, don't bother */
++ if (*pnr_map < 2)
++ return -1;
++
++ old_nr = *pnr_map;
++
++ /* bail out if we find any unreasonable addresses in bios map */
++ for (i=0; i<old_nr; i++)
++ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
++ return -1;
++
++ /* create pointers for initial change-point information (for sorting) */
++ for (i=0; i < 2*old_nr; i++)
++ change_point[i] = &change_point_list[i];
++
++ /* record all known change-points (starting and ending addresses),
++ omitting those that are for empty memory regions */
++ chgidx = 0;
++ for (i=0; i < old_nr; i++) {
++ if (biosmap[i].size != 0) {
++ change_point[chgidx]->addr = biosmap[i].addr;
++ change_point[chgidx++]->pbios = &biosmap[i];
++ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
++ change_point[chgidx++]->pbios = &biosmap[i];
++ }
++ }
++ chg_nr = chgidx; /* true number of change-points */
++
++ /* sort change-point list by memory addresses (low -> high) */
++ still_changing = 1;
++ while (still_changing) {
++ still_changing = 0;
++ for (i=1; i < chg_nr; i++) {
++ /* if <current_addr> > <last_addr>, swap */
++ /* or, if current=<start_addr> & last=<end_addr>, swap */
++ if ((change_point[i]->addr < change_point[i-1]->addr) ||
++ ((change_point[i]->addr == change_point[i-1]->addr) &&
++ (change_point[i]->addr == change_point[i]->pbios->addr) &&
++ (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
++ )
++ {
++ change_tmp = change_point[i];
++ change_point[i] = change_point[i-1];
++ change_point[i-1] = change_tmp;
++ still_changing=1;
++ }
++ }
++ }
++
++ /* create a new bios memory map, removing overlaps */
++ overlap_entries=0; /* number of entries in the overlap table */
++ new_bios_entry=0; /* index for creating new bios map entries */
++ last_type = 0; /* start with undefined memory type */
++ last_addr = 0; /* start with 0 as last starting address */
++	/* loop through change-points, determining the effect on the new bios map */
++ for (chgidx=0; chgidx < chg_nr; chgidx++)
++ {
++ /* keep track of all overlapping bios entries */
++ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
++ {
++ /* add map entry to overlap list (> 1 entry implies an overlap) */
++ overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
++ }
++ else
++ {
++ /* remove entry from list (order independent, so swap with last) */
++ for (i=0; i<overlap_entries; i++)
++ {
++ if (overlap_list[i] == change_point[chgidx]->pbios)
++ overlap_list[i] = overlap_list[overlap_entries-1];
++ }
++ overlap_entries--;
++ }
++ /* if there are overlapping entries, decide which "type" to use */
++ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
++ current_type = 0;
++ for (i=0; i<overlap_entries; i++)
++ if (overlap_list[i]->type > current_type)
++ current_type = overlap_list[i]->type;
++ /* continue building up new bios map based on this information */
++ if (current_type != last_type) {
++ if (last_type != 0) {
++ new_bios[new_bios_entry].size =
++ change_point[chgidx]->addr - last_addr;
++ /* move forward only if the new size was non-zero */
++ if (new_bios[new_bios_entry].size != 0)
++ if (++new_bios_entry >= E820MAX)
++ break; /* no more space left for new bios entries */
++ }
++ if (current_type != 0) {
++ new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
++ new_bios[new_bios_entry].type = current_type;
++ last_addr=change_point[chgidx]->addr;
++ }
++ last_type = current_type;
++ }
++ }
++ new_nr = new_bios_entry; /* retain count for new bios entries */
++
++ /* copy new bios mapping into original location */
++ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
++ *pnr_map = new_nr;
++
++ return 0;
++}
++
++/*
++ * Copy the BIOS e820 map into a safe place.
++ *
++ * Sanity-check it while we're at it..
++ *
++ * If we're lucky and live on a modern system, the setup code
++ * will have given us a memory map that we can use to properly
++ * set up memory. If we aren't, we'll fake a memory map.
++ *
++ * We check to see that the memory map contains at least 2 elements
++ * before we'll use it, because the detection code in setup.S may
++ * not be perfect and most every PC known to man has two memory
++ * regions: one from 0 to 640k, and one from 1mb up. (The IBM
++ * thinkpad 560x, for example, does not cooperate with the memory
++ * detection code.)
++ */
++int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
++{
++#ifndef CONFIG_XEN
++ /* Only one memory region (or negative)? Ignore it */
++ if (nr_map < 2)
++ return -1;
++#else
++ BUG_ON(nr_map < 1);
++#endif
++
++ do {
++ unsigned long long start = biosmap->addr;
++ unsigned long long size = biosmap->size;
++ unsigned long long end = start + size;
++ unsigned long type = biosmap->type;
++
++ /* Overflow in 64 bits? Ignore the memory map. */
++ if (start > end)
++ return -1;
++
++#ifndef CONFIG_XEN
++ /*
++ * Some BIOSes claim RAM in the 640k - 1M region.
++ * Not right. Fix it up.
++ */
++ if (type == E820_RAM) {
++ if (start < 0x100000ULL && end > 0xA0000ULL) {
++ if (start < 0xA0000ULL)
++ add_memory_region(start, 0xA0000ULL-start, type);
++ if (end <= 0x100000ULL)
++ continue;
++ start = 0x100000ULL;
++ size = end - start;
++ }
++ }
++#endif
++ add_memory_region(start, size, type);
++ } while (biosmap++,--nr_map);
++ return 0;
++}
++
++#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
++struct edd edd;
++#ifdef CONFIG_EDD_MODULE
++EXPORT_SYMBOL(edd);
++#endif
++/**
++ * copy_edd() - Copy the BIOS EDD information
++ * from boot_params into a safe place.
++ *
++ */
++static inline void copy_edd(void)
++{
++ memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
++ memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
++ edd.mbr_signature_nr = EDD_MBR_SIG_NR;
++ edd.edd_info_nr = EDD_NR;
++}
++#else
++static inline void copy_edd(void)
++{
++}
++#endif
++
++static void __init parse_cmdline_early (char ** cmdline_p)
++{
++ char c = ' ', *to = command_line, *from = saved_command_line;
++ int len = 0, max_cmdline;
++ int userdef = 0;
++
++ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
++ max_cmdline = COMMAND_LINE_SIZE;
++ memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
++ /* Save unparsed command line copy for /proc/cmdline */
++ saved_command_line[max_cmdline-1] = '\0';
++
++ for (;;) {
++ if (c != ' ')
++ goto next_char;
++ /*
++ * "mem=nopentium" disables the 4MB page tables.
++ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
++ * to <mem>, overriding the bios size.
++ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
++ * <start> to <start>+<mem>, overriding the bios size.
++ *
++		 * HPA tells me bootloaders need to parse mem=, so no new
++		 * option should reuse mem= [also see Documentation/i386/boot.txt]
++ */
++ if (!memcmp(from, "mem=", 4)) {
++ if (to != command_line)
++ to--;
++ if (!memcmp(from+4, "nopentium", 9)) {
++ from += 9+4;
++ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
++ disable_pse = 1;
++ } else {
++ /* If the user specifies memory size, we
++ * limit the BIOS-provided memory map to
++ * that size. exactmap can be used to specify
++ * the exact map. mem=number can be used to
++ * trim the existing memory map.
++ */
++ unsigned long long mem_size;
++
++ mem_size = memparse(from+4, &from);
++ limit_regions(mem_size);
++ userdef=1;
++ }
++ }
++
++ else if (!memcmp(from, "memmap=", 7)) {
++ if (to != command_line)
++ to--;
++ if (!memcmp(from+7, "exactmap", 8)) {
++#ifdef CONFIG_CRASH_DUMP
++ /* If we are doing a crash dump, we
++ * still need to know the real mem
++ * size before original memory map is
++ * reset.
++ */
++ find_max_pfn();
++ saved_max_pfn = max_pfn;
++#endif
++ from += 8+7;
++ e820.nr_map = 0;
++ userdef = 1;
++ } else {
++ /* If the user specifies memory size, we
++ * limit the BIOS-provided memory map to
++ * that size. exactmap can be used to specify
++ * the exact map. mem=number can be used to
++ * trim the existing memory map.
++ */
++ unsigned long long start_at, mem_size;
++
++ mem_size = memparse(from+7, &from);
++ if (*from == '@') {
++ start_at = memparse(from+1, &from);
++ add_memory_region(start_at, mem_size, E820_RAM);
++ } else if (*from == '#') {
++ start_at = memparse(from+1, &from);
++ add_memory_region(start_at, mem_size, E820_ACPI);
++ } else if (*from == '$') {
++ start_at = memparse(from+1, &from);
++ add_memory_region(start_at, mem_size, E820_RESERVED);
++ } else {
++ limit_regions(mem_size);
++ userdef=1;
++ }
++ }
++ }
++
++ else if (!memcmp(from, "noexec=", 7))
++ noexec_setup(from + 7);
++
++
++#ifdef CONFIG_X86_MPPARSE
++ /*
++ * If the BIOS enumerates physical processors before logical,
++ * maxcpus=N at enumeration-time can be used to disable HT.
++ */
++ else if (!memcmp(from, "maxcpus=", 8)) {
++ extern unsigned int maxcpus;
++
++ maxcpus = simple_strtoul(from + 8, NULL, 0);
++ }
++#endif
++
++#ifdef CONFIG_ACPI
++ /* "acpi=off" disables both ACPI table parsing and interpreter */
++ else if (!memcmp(from, "acpi=off", 8)) {
++ disable_acpi();
++ }
++
++	/* acpi=force to override the blacklist */
++ else if (!memcmp(from, "acpi=force", 10)) {
++ acpi_force = 1;
++ acpi_ht = 1;
++ acpi_disabled = 0;
++ }
++
++ /* acpi=strict disables out-of-spec workarounds */
++ else if (!memcmp(from, "acpi=strict", 11)) {
++ acpi_strict = 1;
++ }
++
++ /* Limit ACPI just to boot-time to enable HT */
++ else if (!memcmp(from, "acpi=ht", 7)) {
++ if (!acpi_force)
++ disable_acpi();
++ acpi_ht = 1;
++ }
++
++ /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
++ else if (!memcmp(from, "pci=noacpi", 10)) {
++ acpi_disable_pci();
++ }
++ /* "acpi=noirq" disables ACPI interrupt routing */
++ else if (!memcmp(from, "acpi=noirq", 10)) {
++ acpi_noirq_set();
++ }
++
++ else if (!memcmp(from, "acpi_sci=edge", 13))
++ acpi_sci_flags.trigger = 1;
++
++ else if (!memcmp(from, "acpi_sci=level", 14))
++ acpi_sci_flags.trigger = 3;
++
++ else if (!memcmp(from, "acpi_sci=high", 13))
++ acpi_sci_flags.polarity = 1;
++
++ else if (!memcmp(from, "acpi_sci=low", 12))
++ acpi_sci_flags.polarity = 3;
++
++#ifdef CONFIG_X86_IO_APIC
++ else if (!memcmp(from, "acpi_skip_timer_override", 24))
++ acpi_skip_timer_override = 1;
++
++ if (!memcmp(from, "disable_timer_pin_1", 19))
++ disable_timer_pin_1 = 1;
++ if (!memcmp(from, "enable_timer_pin_1", 18))
++ disable_timer_pin_1 = -1;
++
++ /* disable IO-APIC */
++ else if (!memcmp(from, "noapic", 6))
++ disable_ioapic_setup();
++#endif /* CONFIG_X86_IO_APIC */
++#endif /* CONFIG_ACPI */
++
++#ifdef CONFIG_X86_LOCAL_APIC
++ /* enable local APIC */
++ else if (!memcmp(from, "lapic", 5))
++ lapic_enable();
++
++ /* disable local APIC */
++ else if (!memcmp(from, "nolapic", 6))
++ lapic_disable();
++#endif /* CONFIG_X86_LOCAL_APIC */
++
++#ifdef CONFIG_KEXEC
++ /* crashkernel=size@addr specifies the location to reserve for
++ * a crash kernel. By reserving this memory we guarantee
++	 * that linux never sets it up as a DMA target.
++ * Useful for holding code to do something appropriate
++ * after a kernel panic.
++ */
++ else if (!memcmp(from, "crashkernel=", 12)) {
++#ifndef CONFIG_XEN
++ unsigned long size, base;
++ size = memparse(from+12, &from);
++ if (*from == '@') {
++ base = memparse(from+1, &from);
++ /* FIXME: Do I want a sanity check
++ * to validate the memory range?
++ */
++ crashk_res.start = base;
++ crashk_res.end = base + size - 1;
++ }
++#else
++ printk("Ignoring crashkernel command line, "
++ "parameter will be supplied by xen\n");
++#endif
++ }
++#endif
++#ifdef CONFIG_PROC_VMCORE
++ /* elfcorehdr= specifies the location of elf core header
++ * stored by the crashed kernel.
++ */
++ else if (!memcmp(from, "elfcorehdr=", 11))
++ elfcorehdr_addr = memparse(from+11, &from);
++#endif
++
++ /*
++ * highmem=size forces highmem to be exactly 'size' bytes.
++ * This works even on boxes that have no highmem otherwise.
++ * This also works to reduce highmem size on bigger boxes.
++ */
++ else if (!memcmp(from, "highmem=", 8))
++ highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
++
++ /*
++ * vmalloc=size forces the vmalloc area to be exactly 'size'
++ * bytes. This can be used to increase (or decrease) the
++ * vmalloc area - the default is 128m.
++ */
++ else if (!memcmp(from, "vmalloc=", 8))
++ __VMALLOC_RESERVE = memparse(from+8, &from);
++
++ next_char:
++ c = *(from++);
++ if (!c)
++ break;
++ if (COMMAND_LINE_SIZE <= ++len)
++ break;
++ *(to++) = c;
++ }
++ *to = '\0';
++ *cmdline_p = command_line;
++ if (userdef) {
++ printk(KERN_INFO "user-defined physical RAM map:\n");
++ print_memory_map("user");
++ }
++}
++
++/*
++ * Callback for efi_memory_walk.
++ */
++static int __init
++efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
++{
++ unsigned long *max_pfn = arg, pfn;
++
++ if (start < end) {
++		pfn = PFN_UP(end - 1);
++ if (pfn > *max_pfn)
++ *max_pfn = pfn;
++ }
++ return 0;
++}
++
++static int __init
++efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
++{
++ memory_present(0, start, end);
++ return 0;
++}
++
++/*
++ * This function checks whether the entire range <start,end> is mapped
++ * with the given type.
++ *
++ * Note: this only works correctly if the e820 table is sorted and
++ * non-overlapping, which is the case.
++ */
++int __init
++e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
++{
++ u64 start = s;
++ u64 end = e;
++ int i;
++
++#ifndef CONFIG_XEN
++ for (i = 0; i < e820.nr_map; i++) {
++ struct e820entry *ei = &e820.map[i];
++#else
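++	/* dom0 must check against the real machine map, not its
++	 * pseudo-physical one; other domains simply report no coverage. */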
++ if (!is_initial_xendomain())
++ return 0;
++ for (i = 0; i < machine_e820.nr_map; ++i) {
++ const struct e820entry *ei = &machine_e820.map[i];
++#endif
++ if (type && ei->type != type)
++ continue;
++		/* does the region (partially) overlap the current region? */
++ if (ei->addr >= end || ei->addr + ei->size <= start)
++ continue;
++ /* if the region is at the beginning of <start,end> we move
++ * start to the end of the region since it's ok until there
++ */
++ if (ei->addr <= start)
++ start = ei->addr + ei->size;
++ /* if start is now at or beyond end, we're done, full
++ * coverage */
++ if (start >= end)
++ return 1; /* we're done */
++ }
++ return 0;
++}
++
++/*
++ * Find the highest page frame number we have available
++ */
++void __init find_max_pfn(void)
++{
++ int i;
++
++ max_pfn = 0;
++ if (efi_enabled) {
++ efi_memmap_walk(efi_find_max_pfn, &max_pfn);
++ efi_memmap_walk(efi_memory_present_wrapper, NULL);
++ return;
++ }
++
++ for (i = 0; i < e820.nr_map; i++) {
++ unsigned long start, end;
++ /* RAM? */
++ if (e820.map[i].type != E820_RAM)
++ continue;
++ start = PFN_UP(e820.map[i].addr);
++ end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
++ if (start >= end)
++ continue;
++ if (end > max_pfn)
++ max_pfn = end;
++ memory_present(0, start, end);
++ }
++}
++
++/*
++ * Determine low and high memory ranges:
++ */
++unsigned long __init find_max_low_pfn(void)
++{
++ unsigned long max_low_pfn;
++
++ max_low_pfn = max_pfn;
++ if (max_low_pfn > MAXMEM_PFN) {
++ if (highmem_pages == -1)
++ highmem_pages = max_pfn - MAXMEM_PFN;
++ if (highmem_pages + MAXMEM_PFN < max_pfn)
++ max_pfn = MAXMEM_PFN + highmem_pages;
++ if (highmem_pages + MAXMEM_PFN > max_pfn) {
++ printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
++ highmem_pages = 0;
++ }
++ max_low_pfn = MAXMEM_PFN;
++#ifndef CONFIG_HIGHMEM
++ /* Maximum memory usable is what is directly addressable */
++ printk(KERN_WARNING "Warning only %ldMB will be used.\n",
++ MAXMEM>>20);
++ if (max_pfn > MAX_NONPAE_PFN)
++ printk(KERN_WARNING "Use a PAE enabled kernel.\n");
++ else
++ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
++ max_pfn = MAXMEM_PFN;
++#else /* !CONFIG_HIGHMEM */
++#ifndef CONFIG_X86_PAE
++ if (max_pfn > MAX_NONPAE_PFN) {
++ max_pfn = MAX_NONPAE_PFN;
++ printk(KERN_WARNING "Warning only 4GB will be used.\n");
++ printk(KERN_WARNING "Use a PAE enabled kernel.\n");
++ }
++#endif /* !CONFIG_X86_PAE */
++#endif /* !CONFIG_HIGHMEM */
++ } else {
++ if (highmem_pages == -1)
++ highmem_pages = 0;
++#ifdef CONFIG_HIGHMEM
++ if (highmem_pages >= max_pfn) {
++ printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
++ highmem_pages = 0;
++ }
++ if (highmem_pages) {
++			if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
++ printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
++ highmem_pages = 0;
++ }
++ max_low_pfn -= highmem_pages;
++ }
++#else
++ if (highmem_pages)
++ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
++#endif
++ }
++ return max_low_pfn;
++}
++
++/*
++ * Free all available memory for boot time allocation. Used
++ * as a callback function by efi_memory_walk()
++ */
++
++static int __init
++free_available_memory(unsigned long start, unsigned long end, void *arg)
++{
++ /* check max_low_pfn */
++ if (start >= (max_low_pfn << PAGE_SHIFT))
++ return 0;
++ if (end >= (max_low_pfn << PAGE_SHIFT))
++ end = max_low_pfn << PAGE_SHIFT;
++ if (start < end)
++ free_bootmem(start, end - start);
++
++ return 0;
++}
++/*
++ * Register fully available low RAM pages with the bootmem allocator.
++ */
++static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
++{
++ int i;
++
++ if (efi_enabled) {
++ efi_memmap_walk(free_available_memory, NULL);
++ return;
++ }
++ for (i = 0; i < e820.nr_map; i++) {
++ unsigned long curr_pfn, last_pfn, size;
++ /*
++ * Reserve usable low memory
++ */
++ if (e820.map[i].type != E820_RAM)
++ continue;
++ /*
++ * We are rounding up the start address of usable memory:
++ */
++ curr_pfn = PFN_UP(e820.map[i].addr);
++ if (curr_pfn >= max_low_pfn)
++ continue;
++ /*
++ * ... and at the end of the usable range downwards:
++ */
++ last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
++
++#ifdef CONFIG_XEN
++ /*
++ * Truncate to the number of actual pages currently
++ * present.
++ */
++ if (last_pfn > xen_start_info->nr_pages)
++ last_pfn = xen_start_info->nr_pages;
++#endif
++
++ if (last_pfn > max_low_pfn)
++ last_pfn = max_low_pfn;
++
++ /*
++ * .. finally, did all the rounding and playing
++ * around just make the area go away?
++ */
++ if (last_pfn <= curr_pfn)
++ continue;
++
++ size = last_pfn - curr_pfn;
++ free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
++ }
++}
++
++#ifndef CONFIG_XEN
++/*
++ * workaround for Dell systems that neglect to reserve EBDA
++ */
++static void __init reserve_ebda_region(void)
++{
++ unsigned int addr;
++ addr = get_bios_ebda();
++ if (addr)
++ reserve_bootmem(addr, PAGE_SIZE);
++}
++#endif
++
++#ifndef CONFIG_NEED_MULTIPLE_NODES
++void __init setup_bootmem_allocator(void);
++static unsigned long __init setup_memory(void)
++{
++ /*
++ * partially used pages are not usable - thus
++ * we are rounding upwards:
++ */
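++	/* The boot allocator starts just past the initial page tables
++	 * handed over by the domain builder. */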
++ min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
++ xen_start_info->nr_pt_frames;
++
++ find_max_pfn();
++
++ max_low_pfn = find_max_low_pfn();
++
++#ifdef CONFIG_HIGHMEM
++ highstart_pfn = highend_pfn = max_pfn;
++ if (max_pfn > max_low_pfn) {
++ highstart_pfn = max_low_pfn;
++ }
++ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
++ pages_to_mb(highend_pfn - highstart_pfn));
++#endif
++ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
++ pages_to_mb(max_low_pfn));
++
++ setup_bootmem_allocator();
++
++ return max_low_pfn;
++}
++
++void __init zone_sizes_init(void)
++{
++ unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
++ unsigned int max_dma, low;
++
++ /*
++ * XEN: Our notion of "DMA memory" is fake when running over Xen.
++ * We simply put all RAM in the DMA zone so that those drivers which
++ * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
++ * Those drivers that *do* require lowmem are screwed anyway when
++ * running over Xen!
++ */
++ max_dma = max_low_pfn;
++ low = max_low_pfn;
++
++ if (low < max_dma)
++ zones_size[ZONE_DMA] = low;
++ else {
++ zones_size[ZONE_DMA] = max_dma;
++ zones_size[ZONE_NORMAL] = low - max_dma;
++#ifdef CONFIG_HIGHMEM
++ zones_size[ZONE_HIGHMEM] = highend_pfn - low;
++#endif
++ }
++ free_area_init(zones_size);
++}
++#else
++extern unsigned long __init setup_memory(void);
++extern void zone_sizes_init(void);
++#endif /* !CONFIG_NEED_MULTIPLE_NODES */
++
++void __init setup_bootmem_allocator(void)
++{
++ unsigned long bootmap_size;
++ /*
++ * Initialize the boot-time allocator (with low memory only):
++ */
++ bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
++
++ register_bootmem_low_pages(max_low_pfn);
++
++ /*
++ * Reserve the bootmem bitmap itself as well. We do this in two
++ * steps (first step was init_bootmem()) because this catches
++ * the (very unlikely) case of us accidentally initializing the
++ * bootmem allocator with an invalid RAM area.
++ */
++ reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
++ bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
++
++#ifndef CONFIG_XEN
++ /*
++ * reserve physical page 0 - it's a special BIOS page on many boxes,
++ * enabling clean reboots, SMP operation, laptop functions.
++ */
++ reserve_bootmem(0, PAGE_SIZE);
++
++ /* reserve EBDA region, it's a 4K region */
++ reserve_ebda_region();
++
++ /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
++	   PCI prefetch into it (errata #56). Usually the page is reserved anyway,
++ unless you have no PS/2 mouse plugged in. */
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
++ boot_cpu_data.x86 == 6)
++ reserve_bootmem(0xa0000 - 4096, 4096);
++
++#ifdef CONFIG_SMP
++ /*
++ * But first pinch a few for the stack/trampoline stuff
++ * FIXME: Don't need the extra page at 4K, but need to fix
++ * trampoline before removing it. (see the GDT stuff)
++ */
++ reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
++#endif
++#ifdef CONFIG_ACPI_SLEEP
++ /*
++ * Reserve low memory region for sleep support.
++ */
++ acpi_reserve_bootmem();
++#endif
++#endif /* !CONFIG_XEN */
++
++#ifdef CONFIG_BLK_DEV_INITRD
++ if (xen_start_info->mod_start) {
++ if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
++ /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
++ initrd_start = INITRD_START + PAGE_OFFSET;
++ initrd_end = initrd_start+INITRD_SIZE;
++ initrd_below_start_ok = 1;
++ }
++ else {
++ printk(KERN_ERR "initrd extends beyond end of memory "
++ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
++ INITRD_START + INITRD_SIZE,
++ max_low_pfn << PAGE_SHIFT);
++ initrd_start = 0;
++ }
++ }
++#endif
++#ifdef CONFIG_KEXEC
++#ifdef CONFIG_XEN
++ xen_machine_kexec_setup_resources();
++#else
++ if (crashk_res.start != crashk_res.end)
++ reserve_bootmem(crashk_res.start,
++ crashk_res.end - crashk_res.start + 1);
++#endif
++#endif
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ phys_to_machine_mapping =
++ (unsigned long *)xen_start_info->mfn_list;
++}
++
++/*
++ * The node 0 pgdat is initialized before all of these because
++ * it's needed for bootmem. node>0 pgdats have their virtual
++ * space allocated before the pagetables are in place to access
++ * them, so they can't be cleared then.
++ *
++ * This should all compile down to nothing when NUMA is off.
++ */
++void __init remapped_pgdat_init(void)
++{
++ int nid;
++
++ for_each_online_node(nid) {
++ if (nid != 0)
++ memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
++ }
++}
++
++/*
++ * Request address space for all standard RAM and ROM resources
++ * and also for regions reported as reserved by the e820.
++ */
++static void __init
++legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
++ struct resource *code_resource,
++ struct resource *data_resource)
++{
++ int i;
++
++ probe_roms();
++
++ for (i = 0; i < nr_map; i++) {
++ struct resource *res;
++#ifndef CONFIG_RESOURCES_64BIT
++ if (e820[i].addr + e820[i].size > 0x100000000ULL)
++ continue;
++#endif
++ res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
++ switch (e820[i].type) {
++ case E820_RAM: res->name = "System RAM"; break;
++ case E820_ACPI: res->name = "ACPI Tables"; break;
++ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
++ default: res->name = "reserved";
++ }
++ res->start = e820[i].addr;
++ res->end = res->start + e820[i].size - 1;
++ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
++ if (request_resource(&iomem_resource, res)) {
++ kfree(res);
++ continue;
++ }
++ if (e820[i].type == E820_RAM) {
++ /*
++ * We don't know which RAM region contains kernel data,
++ * so we try it repeatedly and let the resource manager
++ * test it.
++ */
++#ifndef CONFIG_XEN
++ request_resource(res, code_resource);
++ request_resource(res, data_resource);
++#endif
++#ifdef CONFIG_KEXEC
++ if (crashk_res.start != crashk_res.end)
++ request_resource(res, &crashk_res);
++#ifdef CONFIG_XEN
++ xen_machine_kexec_register_resources(res);
++#endif
++#endif
++ }
++ }
++}
++
++/*
++ * Locate an unused range of the physical address space below 4G which
++ * can be used for PCI mappings.
++ */
++static void __init
++e820_setup_gap(struct e820entry *e820, int nr_map)
++{
++ unsigned long gapstart, gapsize, round;
++ unsigned long long last;
++ int i;
++
++ /*
++	 * Search for the biggest gap in the low 32 bits of the e820
++ * memory space.
++ */
++ last = 0x100000000ull;
++ gapstart = 0x10000000;
++ gapsize = 0x400000;
++ i = nr_map;
++ while (--i >= 0) {
++ unsigned long long start = e820[i].addr;
++ unsigned long long end = start + e820[i].size;
++
++ /*
++ * Since "last" is at most 4GB, we know we'll
++ * fit in 32 bits if this condition is true
++ */
++ if (last > end) {
++ unsigned long gap = last - end;
++
++ if (gap > gapsize) {
++ gapsize = gap;
++ gapstart = end;
++ }
++ }
++ if (start < last)
++ last = start;
++ }
++
++ /*
++ * See how much we want to round up: start off with
++ * rounding to the next 1MB area.
++ */
++ round = 0x100000;
++ while ((gapsize >> 4) > round)
++ round += round;
++	/* Fun with two's complement: -round == ~(round - 1), so this aligns the start upward. */
++ pci_mem_start = (gapstart + round) & -round;
++
++ printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
++ pci_mem_start, gapstart, gapsize);
++}
++
++/*
++ * Request address space for all standard resources
++ *
++ * This is called just before pcibios_init(), which is also a
++ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
++ */
++static int __init request_standard_resources(void)
++{
++ int i;
++
++ /* Nothing to do if not running in dom0. */
++ if (!is_initial_xendomain())
++ return 0;
++
++ printk("Setting up standard PCI resources\n");
++#ifdef CONFIG_XEN
++ legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map,
++ &code_resource, &data_resource);
++#else
++ if (efi_enabled)
++ efi_initialize_iomem_resources(&code_resource, &data_resource);
++ else
++ legacy_init_iomem_resources(e820.map, e820.nr_map,
++ &code_resource, &data_resource);
++#endif
++
++ /* EFI systems may still have VGA */
++ request_resource(&iomem_resource, &video_ram_resource);
++
++ /* request I/O space for devices used on all i[345]86 PCs */
++ for (i = 0; i < STANDARD_IO_RESOURCES; i++)
++ request_resource(&ioport_resource, &standard_io_resources[i]);
++ return 0;
++}
++
++subsys_initcall(request_standard_resources);
++
++static void __init register_memory(void)
++{
++#ifdef CONFIG_XEN
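++	/*
++	 * dom0 fetches the host's real e820 map from the hypervisor and
++	 * sizes the PCI gap from that; other domains use their own map.
++	 */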
++ if (is_initial_xendomain()) {
++ struct xen_memory_map memmap;
++
++ memmap.nr_entries = E820MAX;
++ set_xen_guest_handle(memmap.buffer, machine_e820.map);
++
++ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
++ BUG();
++
++ machine_e820.nr_map = memmap.nr_entries;
++ e820_setup_gap(machine_e820.map, machine_e820.nr_map);
++ }
++ else
++#endif
++ e820_setup_gap(e820.map, e820.nr_map);
++}
++
++#ifdef CONFIG_MCA
++static void set_mca_bus(int x)
++{
++ MCA_bus = x;
++}
++#else
++static void set_mca_bus(int x) { }
++#endif
++
++/*
++ * Determine if we were loaded by an EFI loader. If so, then we have also been
++ * passed the efi memmap, systab, etc., so we should use these data structures
++ * for initialization. Note, the efi init code path is determined by the
++ * global efi_enabled. This allows the same kernel image to be used on existing
++ * systems (with a traditional BIOS) as well as on EFI systems.
++ */
++void __init setup_arch(char **cmdline_p)
++{
++ int i, j, k, fpp;
++ struct physdev_set_iopl set_iopl;
++ unsigned long max_low_pfn;
++
++ /* Force a quick death if the kernel panics (not domain 0). */
++ extern int panic_timeout;
++ if (!panic_timeout && !is_initial_xendomain())
++ panic_timeout = 1;
++
++ /* Register a call for panic conditions. */
++ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
++
++ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
++ HYPERVISOR_vm_assist(VMASST_CMD_enable,
++ VMASST_TYPE_writable_pagetables);
++
++ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
++ early_cpu_init();
++#ifdef CONFIG_SMP
++ prefill_possible_map();
++#endif
++
++ /*
++ * FIXME: This isn't an official loader_type right
++ * now but does currently work with elilo.
++ * If we were configured as an EFI kernel, check to make
++ * sure that we were loaded correctly from elilo and that
++ * the system table is valid. If not, then initialize normally.
++ */
++#ifdef CONFIG_EFI
++ if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
++ efi_enabled = 1;
++#endif
++
++ /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
++ properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
++ */
++ ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
++ drive_info = DRIVE_INFO;
++ screen_info = SCREEN_INFO;
++ edid_info = EDID_INFO;
++ apm_info.bios = APM_BIOS_INFO;
++ ist_info = IST_INFO;
++ saved_videomode = VIDEO_MODE;
++ if( SYS_DESC_TABLE.length != 0 ) {
++ set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
++ machine_id = SYS_DESC_TABLE.table[0];
++ machine_submodel_id = SYS_DESC_TABLE.table[1];
++ BIOS_revision = SYS_DESC_TABLE.table[2];
++ }
++ bootloader_type = LOADER_TYPE;
++
++ if (is_initial_xendomain()) {
++ /* This is drawn from a dump from vgacon:startup in
++ * standard Linux. */
++ screen_info.orig_video_mode = 3;
++ screen_info.orig_video_isVGA = 1;
++ screen_info.orig_video_lines = 25;
++ screen_info.orig_video_cols = 80;
++ screen_info.orig_video_ega_bx = 3;
++ screen_info.orig_video_points = 16;
++ screen_info.orig_y = screen_info.orig_video_lines - 1;
++ if (xen_start_info->console.dom0.info_size >=
++ sizeof(struct dom0_vga_console_info)) {
++ const struct dom0_vga_console_info *info =
++ (struct dom0_vga_console_info *)(
++ (char *)xen_start_info +
++ xen_start_info->console.dom0.info_off);
++ dom0_init_screen_info(info);
++ }
++ xen_start_info->console.domU.mfn = 0;
++ xen_start_info->console.domU.evtchn = 0;
++ } else
++ screen_info.orig_video_isVGA = 0;
++
++#ifdef CONFIG_BLK_DEV_RAM
++ rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
++ rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
++ rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
++#endif
++
++ setup_xen_features();
++
++ ARCH_SETUP
++ if (efi_enabled)
++ efi_init();
++ else {
++ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
++ print_memory_map(machine_specific_memory_setup());
++ }
++
++ copy_edd();
++
++ if (!MOUNT_ROOT_RDONLY)
++ root_mountflags &= ~MS_RDONLY;
++ init_mm.start_code = (unsigned long) _text;
++ init_mm.end_code = (unsigned long) _etext;
++ init_mm.end_data = (unsigned long) _edata;
++ init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
++ xen_start_info->nr_pt_frames) << PAGE_SHIFT;
++
++ code_resource.start = virt_to_phys(_text);
++ code_resource.end = virt_to_phys(_etext)-1;
++ data_resource.start = virt_to_phys(_etext);
++ data_resource.end = virt_to_phys(_edata)-1;
++
++ parse_cmdline_early(cmdline_p);
++
++#ifdef CONFIG_EARLY_PRINTK
++ {
++ char *s = strstr(*cmdline_p, "earlyprintk=");
++ if (s) {
++ setup_early_printk(strchr(s, '=') + 1);
++ printk("early console enabled\n");
++ }
++ }
++#endif
++
++ max_low_pfn = setup_memory();
++
++ /*
++ * NOTE: before this point _nobody_ is allowed to allocate
++ * any memory using the bootmem allocator. Although the
++	 * allocator is now initialised, only the first 8MB of the kernel
++ * virtual address space has been mapped. All allocations before
++ * paging_init() has completed must use the alloc_bootmem_low_pages()
++ * variant (which allocates DMA'able memory) and care must be taken
++	 * not to exceed the 8MB limit.
++ */
++
++#ifdef CONFIG_SMP
++ smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
++#endif
++ paging_init();
++ remapped_pgdat_init();
++ sparse_init();
++ zone_sizes_init();
++
++#ifdef CONFIG_X86_FIND_SMP_CONFIG
++ /*
++ * Find and reserve possible boot-time SMP configuration:
++ */
++ find_smp_config();
++#endif
++
++ /* Make sure we have a correctly sized P->M table. */
++ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
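++		/*
++		 * Build a full-sized p2m table: entries beyond the pages
++		 * currently populated are filled with all-ones ("no machine
++		 * frame"), then the builder-supplied initial list is copied
++		 * in and its bootmem pages are freed.
++		 */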
++ phys_to_machine_mapping = alloc_bootmem_low_pages(
++ max_pfn * sizeof(unsigned long));
++ memset(phys_to_machine_mapping, ~0,
++ max_pfn * sizeof(unsigned long));
++ memcpy(phys_to_machine_mapping,
++ (unsigned long *)xen_start_info->mfn_list,
++ xen_start_info->nr_pages * sizeof(unsigned long));
++ free_bootmem(
++ __pa(xen_start_info->mfn_list),
++ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
++ sizeof(unsigned long))));
++
++ /*
++ * Initialise the list of the frames that specify the list of
++ * frames that make up the p2m table. Used by save/restore
++ */
++ pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
++
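++		/* fpp: number of unsigned-long entries per page frame. */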
++ fpp = PAGE_SIZE/sizeof(unsigned long);
++ for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
++ if ((j % fpp) == 0) {
++ k++;
++ BUG_ON(k>=16);
++ pfn_to_mfn_frame_list[k] =
++ alloc_bootmem_low_pages(PAGE_SIZE);
++ pfn_to_mfn_frame_list_list[k] =
++ virt_to_mfn(pfn_to_mfn_frame_list[k]);
++ j=0;
++ }
++ pfn_to_mfn_frame_list[k][j] =
++ virt_to_mfn(&phys_to_machine_mapping[i]);
++ }
++ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
++ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
++ virt_to_mfn(pfn_to_mfn_frame_list_list);
++ }
++
++ /*
++ * NOTE: at this point the bootmem allocator is fully available.
++ */
++
++ if (is_initial_xendomain())
++ dmi_scan_machine();
++
++#ifdef CONFIG_X86_GENERICARCH
++ generic_apic_probe(*cmdline_p);
++#endif
++ if (efi_enabled)
++ efi_map_memmap();
++
++ set_iopl.iopl = 1;
++ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
++
++#ifdef CONFIG_ACPI
++ if (!is_initial_xendomain()) {
++ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
++ acpi_disabled = 1;
++ acpi_ht = 0;
++ }
++
++ /*
++ * Parse the ACPI tables for possible boot-time SMP configuration.
++ */
++ acpi_boot_table_init();
++#endif
++
++#ifdef CONFIG_X86_IO_APIC
++ check_acpi_pci(); /* Checks more than just ACPI actually */
++#endif
++
++#ifdef CONFIG_ACPI
++ acpi_boot_init();
++
++#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
++ if (def_to_bigsmp)
++ printk(KERN_WARNING "More than 8 CPUs detected and "
++ "CONFIG_X86_PC cannot handle it.\nUse "
++ "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
++#endif
++#endif
++#ifdef CONFIG_X86_LOCAL_APIC
++ if (smp_found_config)
++ get_smp_config();
++#endif
++
++ register_memory();
++
++ if (is_initial_xendomain()) {
++#ifdef CONFIG_VT
++#if defined(CONFIG_VGA_CONSOLE)
++ if (!efi_enabled ||
++ (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
++ conswitchp = &vga_con;
++#elif defined(CONFIG_DUMMY_CONSOLE)
++ conswitchp = &dummy_con;
++#endif
++#endif
++ } else {
++#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
++ conswitchp = &dummy_con;
++#endif
++ }
++ tsc_init();
++
++ xencons_early_setup();
++}
++
++static int
++xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
++{
++ HYPERVISOR_shutdown(SHUTDOWN_crash);
++ /* we're never actually going to get here... */
++ return NOTIFY_DONE;
++}
++
++static __init int add_pcspkr(void)
++{
++ struct platform_device *pd;
++ int ret;
++
++ if (!is_initial_xendomain())
++ return 0;
++
++ pd = platform_device_alloc("pcspkr", -1);
++ if (!pd)
++ return -ENOMEM;
++
++ ret = platform_device_add(pd);
++ if (ret)
++ platform_device_put(pd);
++
++ return ret;
++}
++device_initcall(add_pcspkr);
++
++/*
++ * Local Variables:
++ * mode:c
++ * c-file-style:"k&r"
++ * c-basic-offset:8
++ * End:
++ */
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/smp-xen.c 2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,624 @@
++/*
++ * Intel SMP support routines.
++ *
++ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
++ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
++ *
++ * This code is released under the GNU General Public License version 2 or
++ * later.
++ */
++
++#include <linux/init.h>
++
++#include <linux/mm.h>
++#include <linux/delay.h>
++#include <linux/spinlock.h>
++#include <linux/smp_lock.h>
++#include <linux/kernel_stat.h>
++#include <linux/mc146818rtc.h>
++#include <linux/cache.h>
++#include <linux/interrupt.h>
++#include <linux/cpu.h>
++#include <linux/module.h>
++
++#include <asm/mtrr.h>
++#include <asm/tlbflush.h>
++#if 0
++#include <mach_apic.h>
++#endif
++#include <xen/evtchn.h>
++
++/*
++ * Some notes on x86 processor bugs affecting SMP operation:
++ *
++ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
++ * The Linux implications for SMP are handled as follows:
++ *
++ * Pentium III / [Xeon]
++ * None of the E1AP-E3AP errata are visible to the user.
++ *
++ * E1AP. see PII A1AP
++ * E2AP. see PII A2AP
++ * E3AP. see PII A3AP
++ *
++ * Pentium II / [Xeon]
++ * None of the A1AP-A3AP errata are visible to the user.
++ *
++ * A1AP. see PPro 1AP
++ * A2AP. see PPro 2AP
++ * A3AP. see PPro 7AP
++ *
++ * Pentium Pro
++ * None of 1AP-9AP errata are visible to the normal user,
++ * except occasional delivery of 'spurious interrupt' as trap #15.
++ * This is very rare and a non-problem.
++ *
++ * 1AP. Linux maps APIC as non-cacheable
++ * 2AP. worked around in hardware
++ * 3AP. fixed in C0 and above steppings microcode update.
++ * Linux does not use excessive STARTUP_IPIs.
++ * 4AP. worked around in hardware
++ * 5AP. symmetric IO mode (normal Linux operation) not affected.
++ * 'noapic' mode has vector 0xf filled out properly.
++ * 6AP. 'noapic' mode might be affected - fixed in later steppings
++ * 7AP. We do not assume writes to the LVT deasserting IRQs
++ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
++ * 9AP. We do not use mixed mode
++ *
++ * Pentium
++ * There is a marginal case where REP MOVS on 100MHz SMP
++ * machines with B stepping processors can fail. XXX should provide
++ * an L1cache=Writethrough or L1cache=off option.
++ *
++ * B stepping CPUs may hang. There are hardware workarounds
++ * for this. We warn about it in case your board doesn't have the
++ * workarounds. Basically that's so I can tell anyone with a B stepping
++ * CPU and SMP problems "tough".
++ *
++ * Specific items [From Pentium Processor Specification Update]
++ *
++ * 1AP. Linux doesn't use remote read
++ * 2AP. Linux doesn't trust APIC errors
++ * 3AP. We work around this
++ * 4AP. Linux never generated 3 interrupts of the same priority
++ * to cause a lost local interrupt.
++ * 5AP. Remote read is never used
++ * 6AP. not affected - worked around in hardware
++ * 7AP. not affected - worked around in hardware
++ * 8AP. worked around in hardware - we get explicit CS errors if not
++ * 9AP. only 'noapic' mode affected. Might generate spurious
++ * interrupts, we log only the first one and count the
++ * rest silently.
++ * 10AP. not affected - worked around in hardware
++ * 11AP. Linux reads the APIC between writes to avoid this, as per
++ * the documentation. Make sure you preserve this as it affects
++ * the C stepping chips too.
++ * 12AP. not affected - worked around in hardware
++ * 13AP. not affected - worked around in hardware
++ * 14AP. we always deassert INIT during bootup
++ * 15AP. not affected - worked around in hardware
++ * 16AP. not affected - worked around in hardware
++ * 17AP. not affected - worked around in hardware
++ * 18AP. not affected - worked around in hardware
++ * 19AP. not affected - worked around in BIOS
++ *
++ * If this sounds worrying, believe me these bugs are either ___RARE___
++ * or are signal timing bugs worked around in hardware, and there's
++ * next to nothing of note from C stepping upwards.
++ */
++
++DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
++
++/*
++ * the following functions deal with sending IPIs between CPUs.
++ *
++ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
++ */
++
++static inline int __prepare_ICR (unsigned int shortcut, int vector)
++{
++ unsigned int icr = shortcut | APIC_DEST_LOGICAL;
++
++ switch (vector) {
++ default:
++ icr |= APIC_DM_FIXED | vector;
++ break;
++ case NMI_VECTOR:
++ icr |= APIC_DM_NMI;
++ break;
++ }
++ return icr;
++}
++
++static inline int __prepare_ICR2 (unsigned int mask)
++{
++ return SET_APIC_DEST_FIELD(mask);
++}
++
++DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
++
++static inline void __send_IPI_one(unsigned int cpu, int vector)
++{
++ int irq = per_cpu(ipi_to_irq, cpu)[vector];
++ BUG_ON(irq < 0);
++ notify_remote_via_irq(irq);
++}
++
++void __send_IPI_shortcut(unsigned int shortcut, int vector)
++{
++ int cpu;
++
++ switch (shortcut) {
++ case APIC_DEST_SELF:
++ __send_IPI_one(smp_processor_id(), vector);
++ break;
++ case APIC_DEST_ALLBUT:
++ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
++ if (cpu == smp_processor_id())
++ continue;
++ if (cpu_isset(cpu, cpu_online_map)) {
++ __send_IPI_one(cpu, vector);
++ }
++ }
++ break;
++ default:
++ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
++ vector);
++ break;
++ }
++}
++
++void fastcall send_IPI_self(int vector)
++{
++ __send_IPI_shortcut(APIC_DEST_SELF, vector);
++}
++
++/*
++ * This is only used on smaller machines.
++ */
++void send_IPI_mask_bitmask(cpumask_t mask, int vector)
++{
++ unsigned long flags;
++ unsigned int cpu;
++
++ local_irq_save(flags);
++ WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
++
++ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
++ if (cpu_isset(cpu, mask)) {
++ __send_IPI_one(cpu, vector);
++ }
++ }
++
++ local_irq_restore(flags);
++}
++
++void send_IPI_mask_sequence(cpumask_t mask, int vector)
++{
++ send_IPI_mask_bitmask(mask, vector);
++}
++
++#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
++
++#if 0 /* XEN */
++/*
++ * Smarter SMP flushing macros.
++ * c/o Linus Torvalds.
++ *
++ * These mean you can really definitely utterly forget about
++ * writing to user space from interrupts. (It's not allowed anyway.)
++ *
++ * Optimizations Manfred Spraul <manfred@colorfullife.com>
++ */
++
++static cpumask_t flush_cpumask;
++static struct mm_struct * flush_mm;
++static unsigned long flush_va;
++static DEFINE_SPINLOCK(tlbstate_lock);
++#define FLUSH_ALL 0xffffffff
++
++/*
++ * We cannot call mmdrop() because we are in interrupt context;
++ * instead update mm->cpu_vm_mask.
++ *
++ * We need to reload %cr3 since the page tables may be going
++ * away from under us..
++ */
++static inline void leave_mm (unsigned long cpu)
++{
++ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
++ BUG();
++ cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
++ load_cr3(swapper_pg_dir);
++}
++
++/*
++ *
++ * The flush IPI assumes that a thread switch happens in this order:
++ * [cpu0: the cpu that switches]
++ * 1) switch_mm() either 1a) or 1b)
++ * 1a) thread switch to a different mm
++ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
++ * Stop ipi delivery for the old mm. This is not synchronized with
++ * the other cpus, but smp_invalidate_interrupt ignores flush ipis
++ * for the wrong mm, and in the worst case we perform a superfluous
++ * tlb flush.
++ * 1a2) set cpu_tlbstate to TLBSTATE_OK
++ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
++ * was in lazy tlb mode.
++ * 1a3) update cpu_tlbstate[].active_mm
++ * Now cpu0 accepts tlb flushes for the new mm.
++ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
++ * Now the other cpus will send tlb flush ipis.
++ * 1a5) change cr3.
++ * 1b) thread switch without mm change
++ * cpu_tlbstate[].active_mm is correct, cpu0 already handles
++ * flush ipis.
++ * 1b1) set cpu_tlbstate to TLBSTATE_OK
++ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
++ * Atomically set the bit [other cpus will start sending flush ipis],
++ * and test the bit.
++ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
++ * 2) switch %%esp, i.e. current
++ *
++ * The interrupt must handle 2 special cases:
++ * - cr3 is changed before %%esp, i.e. it cannot use current->{active_,}mm.
++ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
++ * runs in kernel space, the cpu could load tlb entries for user space
++ * pages.
++ *
++ * The good news is that cpu_tlbstate is local to each cpu, no
++ * write/read ordering problems.
++ */
++
++/*
++ * TLB flush IPI:
++ *
++ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
++ * 2) Leave the mm if we are in the lazy tlb mode.
++ */
++
++irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
++ struct pt_regs *regs)
++{
++ unsigned long cpu;
++
++ cpu = get_cpu();
++
++ if (!cpu_isset(cpu, flush_cpumask))
++ goto out;
++ /*
++ * This was a BUG() but until someone can quote me the
++ * line from the intel manual that guarantees an IPI to
++ * multiple CPUs is retried _only_ on the erroring CPUs
++ * it's staying as a return
++ *
++ * BUG();
++ */
++
++ if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
++ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
++ if (flush_va == FLUSH_ALL)
++ local_flush_tlb();
++ else
++ __flush_tlb_one(flush_va);
++ } else
++ leave_mm(cpu);
++ }
++ smp_mb__before_clear_bit();
++ cpu_clear(cpu, flush_cpumask);
++ smp_mb__after_clear_bit();
++out:
++ put_cpu_no_resched();
++
++ return IRQ_HANDLED;
++}
++
++static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
++ unsigned long va)
++{
++ /*
++ * A couple of (to be removed) sanity checks:
++ *
++ * - current CPU must not be in mask
++ * - mask must exist :)
++ */
++ BUG_ON(cpus_empty(cpumask));
++ BUG_ON(cpu_isset(smp_processor_id(), cpumask));
++ BUG_ON(!mm);
++
++ /* If a CPU which we ran on has gone down, OK. */
++ cpus_and(cpumask, cpumask, cpu_online_map);
++ if (cpus_empty(cpumask))
++ return;
++
++ /*
++ * I'm not happy about this global shared spinlock in the
++ * MM hot path, but we'll see how contended it is.
++ * Temporarily this turns IRQs off, so that lockups are
++ * detected by the NMI watchdog.
++ */
++ spin_lock(&tlbstate_lock);
++
++ flush_mm = mm;
++ flush_va = va;
++#if NR_CPUS <= BITS_PER_LONG
++ atomic_set_mask(cpumask, &flush_cpumask);
++#else
++ {
++ int k;
++ unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
++ unsigned long *cpu_mask = (unsigned long *)&cpumask;
++ for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
++ atomic_set_mask(cpu_mask[k], &flush_mask[k]);
++ }
++#endif
++ /*
++ * We have to send the IPI only to
++ * CPUs affected.
++ */
++ send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
++
++ while (!cpus_empty(flush_cpumask))
++ /* nothing. lockup detection does not belong here */
++ mb();
++
++ flush_mm = NULL;
++ flush_va = 0;
++ spin_unlock(&tlbstate_lock);
++}
++
++void flush_tlb_current_task(void)
++{
++ struct mm_struct *mm = current->mm;
++ cpumask_t cpu_mask;
++
++ preempt_disable();
++ cpu_mask = mm->cpu_vm_mask;
++ cpu_clear(smp_processor_id(), cpu_mask);
++
++ local_flush_tlb();
++ if (!cpus_empty(cpu_mask))
++ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
++ preempt_enable();
++}
++
++void flush_tlb_mm (struct mm_struct * mm)
++{
++ cpumask_t cpu_mask;
++
++ preempt_disable();
++ cpu_mask = mm->cpu_vm_mask;
++ cpu_clear(smp_processor_id(), cpu_mask);
++
++ if (current->active_mm == mm) {
++ if (current->mm)
++ local_flush_tlb();
++ else
++ leave_mm(smp_processor_id());
++ }
++ if (!cpus_empty(cpu_mask))
++ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
++
++ preempt_enable();
++}
++
++void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ cpumask_t cpu_mask;
++
++ preempt_disable();
++ cpu_mask = mm->cpu_vm_mask;
++ cpu_clear(smp_processor_id(), cpu_mask);
++
++ if (current->active_mm == mm) {
++ if (current->mm)
++ __flush_tlb_one(va);
++ else
++ leave_mm(smp_processor_id());
++ }
++
++ if (!cpus_empty(cpu_mask))
++ flush_tlb_others(cpu_mask, mm, va);
++
++ preempt_enable();
++}
++EXPORT_SYMBOL(flush_tlb_page);
++
++static void do_flush_tlb_all(void* info)
++{
++ unsigned long cpu = smp_processor_id();
++
++ __flush_tlb_all();
++ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
++ leave_mm(cpu);
++}
++
++void flush_tlb_all(void)
++{
++ on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
++}
++
++#else
++
++irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
++ struct pt_regs *regs)
++{ return 0; }
++void flush_tlb_current_task(void)
++{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
++void flush_tlb_mm(struct mm_struct * mm)
++{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
++void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
++{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
++EXPORT_SYMBOL(flush_tlb_page);
++void flush_tlb_all(void)
++{ xen_tlb_flush_all(); }
++
++#endif /* XEN */
++
++/*
++ * This function sends a 'reschedule' IPI to another CPU.
++ * It goes straight through and wastes no time serializing
++ * anything. Worst case is that we lose a reschedule ...
++ */
++void smp_send_reschedule(int cpu)
++{
++ WARN_ON(cpu_is_offline(cpu));
++ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
++}
++
++/*
++ * Structure and data for smp_call_function(). This is designed to minimise
++ * static memory requirements. It also looks cleaner.
++ */
++static DEFINE_SPINLOCK(call_lock);
++
++struct call_data_struct {
++ void (*func) (void *info);
++ void *info;
++ atomic_t started;
++ atomic_t finished;
++ int wait;
++};
++
++void lock_ipi_call_lock(void)
++{
++ spin_lock_irq(&call_lock);
++}
++
++void unlock_ipi_call_lock(void)
++{
++ spin_unlock_irq(&call_lock);
++}
++
++static struct call_data_struct *call_data;
++
++/**
++ * smp_call_function(): Run a function on all other CPUs.
++ * @func: The function to run. This must be fast and non-blocking.
++ * @info: An arbitrary pointer to pass to the function.
++ * @nonatomic: currently unused.
++ * @wait: If true, wait (atomically) until function has completed on other CPUs.
++ *
++ * Returns 0 on success, else a negative status code. Does not return until
++ * remote CPUs are nearly ready to execute <<func>> or have executed it.
++ *
++ * You must not call this function with disabled interrupts or from a
++ * hardware interrupt handler or from a bottom half handler.
++ */
++int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
++ int wait)
++{
++ struct call_data_struct data;
++ int cpus;
++
++ /* Holding any lock stops cpus from going down. */
++ spin_lock(&call_lock);
++ cpus = num_online_cpus() - 1;
++ if (!cpus) {
++ spin_unlock(&call_lock);
++ return 0;
++ }
++
++ /* Can deadlock when called with interrupts disabled */
++ WARN_ON(irqs_disabled());
++
++ data.func = func;
++ data.info = info;
++ atomic_set(&data.started, 0);
++ data.wait = wait;
++ if (wait)
++ atomic_set(&data.finished, 0);
++
++ call_data = &data;
++ mb();
++
++ /* Send a message to all other CPUs and wait for them to respond */
++ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
++
++ /* Wait for response */
++ while (atomic_read(&data.started) != cpus)
++ barrier();
++
++ if (wait)
++ while (atomic_read(&data.finished) != cpus)
++ barrier();
++ spin_unlock(&call_lock);
++
++ return 0;
++}
++EXPORT_SYMBOL(smp_call_function);
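++/*
++ * Usage sketch (illustrative only; 'bump' and 'hits' are hypothetical):
++ * run a fast, non-blocking function on every other online CPU and wait
++ * for all of them to finish.
++ *
++ *	static atomic_t hits = ATOMIC_INIT(0);
++ *	static void bump(void *info) { atomic_inc((atomic_t *)info); }
++ *	...
++ *	smp_call_function(bump, &hits, 0, 1);
++ */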
++
++static void stop_this_cpu (void * dummy)
++{
++ /*
++ * Remove this CPU:
++ */
++ cpu_clear(smp_processor_id(), cpu_online_map);
++ local_irq_disable();
++#if 0
++ disable_local_APIC();
++#endif
++ if (cpu_data[smp_processor_id()].hlt_works_ok)
++ for(;;) halt();
++ for (;;);
++}
++
++/*
++ * This function calls the 'stop' function on all other CPUs in the system.
++ */
++
++void smp_send_stop(void)
++{
++ smp_call_function(stop_this_cpu, NULL, 1, 0);
++
++ local_irq_disable();
++#if 0
++ disable_local_APIC();
++#endif
++ local_irq_enable();
++}
++
++/*
++ * Reschedule callback. Nothing to do,
++ * all the work is done automatically when
++ * we return from the interrupt.
++ */
++irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
++ struct pt_regs *regs)
++{
++ return IRQ_HANDLED;
++}
++
++#include <linux/kallsyms.h>
++irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
++ struct pt_regs *regs)
++{
++ void (*func) (void *info) = call_data->func;
++ void *info = call_data->info;
++ int wait = call_data->wait;
++
++ /*
++ * Notify initiating CPU that I've grabbed the data and am
++ * about to execute the function
++ */
++ mb();
++ atomic_inc(&call_data->started);
++ /*
++ * At this point the info structure may be out of scope unless wait==1
++ */
++ irq_enter();
++ (*func)(info);
++ irq_exit();
++
++ if (wait) {
++ mb();
++ atomic_inc(&call_data->finished);
++ }
++
++ return IRQ_HANDLED;
++}
++
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/arch/i386/kernel/swiotlb.c 2007-08-27 14:02:10.000000000 -0400
+@@ -0,0 +1,716 @@
++/*
++ * Dynamic DMA mapping support.
++ *
++ * This implementation is a fallback for platforms that do not support
++ * I/O TLBs (aka DMA address translation hardware).
++ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
++ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
++ * Copyright (C) 2000, 2003 Hewlett-Packard Co
++ * David Mosberger-Tang <davidm@hpl.hp.com>
++ * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
++ */
++
++#include <linux/cache.h>
++#include <linux/mm.h>
++#include <linux/module.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include <linux/string.h>
++#include <linux/types.h>
++#include <linux/ctype.h>
++#include <linux/init.h>
++#include <linux/bootmem.h>
++#include <linux/highmem.h>
++#include <asm/io.h>
++#include <asm/pci.h>
++#include <asm/dma.h>
++#include <asm/uaccess.h>
++#include <xen/interface/memory.h>
++
++int swiotlb;
++EXPORT_SYMBOL(swiotlb);
++
++#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
++
++#define SG_ENT_PHYS_ADDRESS(sg) (page_to_bus((sg)->page) + (sg)->offset)
++
++/*
++ * Maximum allowable number of contiguous slabs to map,
++ * must be a power of 2. What is the appropriate value?
++ * The complexity of {map,unmap}_single is linearly dependent on this value.
++ */
++#define IO_TLB_SEGSIZE 128
++
++/*
++ * log of the size of each IO TLB slab. The number of slabs is command line
++ * controllable.
++ */
++#define IO_TLB_SHIFT 11
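++/*
++ * Worked example: with IO_TLB_SHIFT == 11 each slab is 2KB, so one
++ * IO_TLB_SEGSIZE segment of 128 slabs caps a single mapping at 256KB.
++ */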
++
++int swiotlb_force;
++
++static char *iotlb_virt_start;
++static unsigned long iotlb_nslabs;
++
++/*
++ * Used to do a quick range check in swiotlb_unmap_single and
++ * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
++ * API.
++ */
++static unsigned long iotlb_pfn_start, iotlb_pfn_end;
++
++/* Does the given dma address reside within the swiotlb aperture? */
++static inline int in_swiotlb_aperture(dma_addr_t dev_addr)
++{
++ unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT);
++ return (pfn_valid(pfn)
++ && (pfn >= iotlb_pfn_start)
++ && (pfn < iotlb_pfn_end));
++}
++
++/*
++ * When the IOMMU overflows we return a fallback buffer. This sets the size.
++ */
++static unsigned long io_tlb_overflow = 32*1024;
++
++void *io_tlb_overflow_buffer;
++
++/*
++ * This is a free list describing the number of free entries available from
++ * each index
++ */
++static unsigned int *io_tlb_list;
++static unsigned int io_tlb_index;
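++/*
++ * Encoding example (illustrative): io_tlb_list[i] holds the number of
++ * contiguous free slots available starting at index i, bounded by the
++ * segment end. If slots 4..7 of a segment are free and slot 8 is busy,
++ * the array reads ..., 4, 3, 2, 1, 0, ... at indices 4..8, so a single
++ * read tells map_single how many contiguous slots follow.
++ */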
++
++/*
++ * We need to save away the original address corresponding to a mapped entry
++ * for the sync operations.
++ */
++static struct phys_addr {
++ struct page *page;
++ unsigned int offset;
++} *io_tlb_orig_addr;
++
++/*
++ * Protect the above data structures in the map and unmap calls
++ */
++static DEFINE_SPINLOCK(io_tlb_lock);
++
++static unsigned int dma_bits;
++static unsigned int __initdata max_dma_bits = 32;
++static int __init
++setup_dma_bits(char *str)
++{
++ max_dma_bits = simple_strtoul(str, NULL, 0);
++ return 0;
++}
++__setup("dma_bits=", setup_dma_bits);
++
++static int __init
++setup_io_tlb_npages(char *str)
++{
++ /* Unlike ia64, the size is the aperture in megabytes, not 'slabs'! */
++ if (isdigit(*str)) {
++ iotlb_nslabs = simple_strtoul(str, &str, 0) <<
++ (20 - IO_TLB_SHIFT);
++ iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
++ /* Round up to power of two (xen_create_contiguous_region). */
++ while (iotlb_nslabs & (iotlb_nslabs-1))
++ iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
++ }
++ if (*str == ',')
++ ++str;
++ /*
++ * NB. 'force' enables the swiotlb, but doesn't force its use for
++ * every DMA like it does on native Linux. 'off' forcibly disables
++ * use of the swiotlb.
++ */
++ if (!strcmp(str, "force"))
++ swiotlb_force = 1;
++ else if (!strcmp(str, "off"))
++ swiotlb_force = -1;
++ return 1;
++}
++__setup("swiotlb=", setup_io_tlb_npages);
++/* make io_tlb_overflow tunable too? */
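++/*
++ * Example (illustrative): "swiotlb=64" reserves a 64MB aperture (the
++ * slab count is rounded up to a power of two for
++ * xen_create_contiguous_region); "swiotlb=64,force" additionally enables
++ * the swiotlb where it would otherwise stay off, and "swiotlb=off"
++ * forcibly disables it.
++ */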
++
++/*
++ * Statically reserve bounce buffer space and initialize bounce buffer data
++ * structures for the software IO TLB used to implement the PCI DMA API.
++ */
++void
++swiotlb_init_with_default_size (size_t default_size)
++{
++ unsigned long i, bytes;
++ int rc;
++
++ if (!iotlb_nslabs) {
++ iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
++ iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
++ /* Round up to power of two (xen_create_contiguous_region). */
++ while (iotlb_nslabs & (iotlb_nslabs-1))
++ iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
++ }
++
++ bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
++
++ /*
++ * Get IO TLB memory from the low pages
++ */
++ iotlb_virt_start = alloc_bootmem_low_pages(bytes);
++ if (!iotlb_virt_start)
++ panic("Cannot allocate SWIOTLB buffer!\n");
++
++ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
++ for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
++ do {
++ rc = xen_create_contiguous_region(
++ (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT),
++ get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
++ dma_bits);
++ } while (rc && dma_bits++ < max_dma_bits);
++ if (rc) {
++ if (i == 0)
++ panic("No suitable physical memory available for SWIOTLB buffer!\n"
++ "Use dom0_mem Xen boot parameter to reserve\n"
++ "some DMA memory (e.g., dom0_mem=-128M).\n");
++ iotlb_nslabs = i;
++ i <<= IO_TLB_SHIFT;
++ free_bootmem(__pa(iotlb_virt_start + i), bytes - i);
++ bytes = i;
++ for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) {
++ unsigned int bits = fls64(virt_to_bus(iotlb_virt_start + i - 1));
++
++ if (bits > dma_bits)
++ dma_bits = bits;
++ }
++ break;
++ }
++ }
++
++ /*
++ * Allocate and initialize the free list array. This array is used
++ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
++ */
++ io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
++ for (i = 0; i < iotlb_nslabs; i++)
++ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
++ io_tlb_index = 0;
++ io_tlb_orig_addr = alloc_bootmem(
++ iotlb_nslabs * sizeof(*io_tlb_orig_addr));
++
++ /*
++ * Get the overflow emergency buffer
++ */
++ io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
++ if (!io_tlb_overflow_buffer)
++ panic("Cannot allocate SWIOTLB overflow buffer!\n");
++
++ do {
++ rc = xen_create_contiguous_region(
++ (unsigned long)io_tlb_overflow_buffer,
++ get_order(io_tlb_overflow),
++ dma_bits);
++ } while (rc && dma_bits++ < max_dma_bits);
++ if (rc)
++ panic("No suitable physical memory available for SWIOTLB overflow buffer!\n");
++
++ iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT;
++ iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT);
++
++ printk(KERN_INFO "Software IO TLB enabled: \n"
++ " Aperture: %lu megabytes\n"
++ " Kernel range: %p - %p\n"
++ " Address size: %u bits\n",
++ bytes >> 20,
++ iotlb_virt_start, iotlb_virt_start + bytes,
++ dma_bits);
++}
++
++void
++swiotlb_init(void)
++{
++ long ram_end;
++ size_t defsz = 64 * (1 << 20); /* 64MB default size */
++
++ if (swiotlb_force == 1) {
++ swiotlb = 1;
++ } else if ((swiotlb_force != -1) &&
++ is_running_on_xen() &&
++ is_initial_xendomain()) {
++ /* Domain 0 always has a swiotlb. */
++ ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++ if (ram_end <= 0x7ffff)
++ defsz = 2 * (1 << 20); /* 2MB on systems with <2GB RAM. */
++ swiotlb = 1;
++ }
++
++ if (swiotlb)
++ swiotlb_init_with_default_size(defsz);
++ else
++ printk(KERN_INFO "Software IO TLB disabled\n");
++}
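++/*
++ * Note (illustrative arithmetic): XENMEM_maximum_ram_page returns the
++ * highest machine page frame number, so ram_end <= 0x7ffff means at most
++ * 0x80000 pages * 4KB == 2GB of host RAM, for which the code picks the
++ * small 2MB default aperture.
++ */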
++
++/*
++ * We use __copy_to_user_inatomic to transfer to the host buffer because the
++ * buffer may be mapped read-only (e.g, in blkback driver) but lower-level
++ * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
++ * unnecessary copy from the aperture to the host buffer, and a page fault.
++ */
++static void
++__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
++{
++ if (PageHighMem(buffer.page)) {
++ size_t len, bytes;
++ char *dev, *host, *kmp;
++ len = size;
++ while (len != 0) {
++ unsigned long flags;
++
++ if (((bytes = len) + buffer.offset) > PAGE_SIZE)
++ bytes = PAGE_SIZE - buffer.offset;
++ local_irq_save(flags); /* protects KM_BOUNCE_READ */
++ kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ);
++ dev = dma_addr + size - len;
++ host = kmp + buffer.offset;
++ if (dir == DMA_FROM_DEVICE) {
++ if (__copy_to_user_inatomic(host, dev, bytes))
++ /* inaccessible */;
++ } else
++ memcpy(dev, host, bytes);
++ kunmap_atomic(kmp, KM_BOUNCE_READ);
++ local_irq_restore(flags);
++ len -= bytes;
++ buffer.page++;
++ buffer.offset = 0;
++ }
++ } else {
++ char *host = (char *)phys_to_virt(
++ page_to_pseudophys(buffer.page)) + buffer.offset;
++ if (dir == DMA_FROM_DEVICE) {
++ if (__copy_to_user_inatomic(host, dma_addr, size))
++ /* inaccessible */;
++ } else if (dir == DMA_TO_DEVICE)
++ memcpy(dma_addr, host, size);
++ }
++}
++
++/*
++ * Allocates bounce buffer and returns its kernel virtual address.
++ */
++static void *
++map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
++{
++ unsigned long flags;
++ char *dma_addr;
++ unsigned int nslots, stride, index, wrap;
++ int i;
++
++ /*
++ * For mappings greater than a page, we limit the stride (and
++ * hence alignment) to a page size.
++ */
++ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
++ if (size > PAGE_SIZE)
++ stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
++ else
++ stride = 1;
++
++ BUG_ON(!nslots);
++
++ /*
++ * Find a suitable number of IO TLB entries that will fit this
++ * request and allocate a buffer from that IO TLB pool.
++ */
++ spin_lock_irqsave(&io_tlb_lock, flags);
++ {
++ wrap = index = ALIGN(io_tlb_index, stride);
++
++ if (index >= iotlb_nslabs)
++ wrap = index = 0;
++
++ do {
++ /*
++ * If we find a slot that indicates we have 'nslots'
++ * number of contiguous buffers, we allocate the
++ * buffers from that slot and mark the entries as '0'
++ * indicating unavailable.
++ */
++ if (io_tlb_list[index] >= nslots) {
++ int count = 0;
++
++ for (i = index; i < (int)(index + nslots); i++)
++ io_tlb_list[i] = 0;
++ for (i = index - 1;
++ (OFFSET(i, IO_TLB_SEGSIZE) !=
++ IO_TLB_SEGSIZE - 1) && io_tlb_list[i];
++ i--)
++ io_tlb_list[i] = ++count;
++ dma_addr = iotlb_virt_start +
++ (index << IO_TLB_SHIFT);
++
++ /*
++ * Update the indices to avoid searching in
++ * the next round.
++ */
++ io_tlb_index =
++ ((index + nslots) < iotlb_nslabs
++ ? (index + nslots) : 0);
++
++ goto found;
++ }
++ index += stride;
++ if (index >= iotlb_nslabs)
++ index = 0;
++ } while (index != wrap);
++
++ spin_unlock_irqrestore(&io_tlb_lock, flags);
++ return NULL;
++ }
++ found:
++ spin_unlock_irqrestore(&io_tlb_lock, flags);
++
++ /*
++ * Save away the mapping from the original address to the DMA address.
++ * This is needed when we sync the memory. Then we sync the buffer if
++ * needed.
++ */
++ io_tlb_orig_addr[index] = buffer;
++ if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
++ __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
++
++ return dma_addr;
++}
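++/*
++ * Worked example: an 8KB request gives nslots == 4 and, being larger
++ * than a page, stride == 1 << (PAGE_SHIFT - IO_TLB_SHIFT) == 2, so only
++ * even (page-aligned) slot indices are tried; requests of one page or
++ * less search every slot with stride 1.
++ */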
++
++/*
++ * dma_addr is the kernel virtual address of the bounce buffer to unmap.
++ */
++static void
++unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
++{
++ unsigned long flags;
++ int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
++ int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
++ struct phys_addr buffer = io_tlb_orig_addr[index];
++
++ /*
++ * First, sync the memory before unmapping the entry
++ */
++ if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
++ __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
++
++ /*
++ * Return the buffer to the free list by setting the corresponding
++ * entries to indicate the number of contiguous entries available.
++ * While returning the entries to the free list, we merge the entries
++ * with slots below and above the pool being returned.
++ */
++ spin_lock_irqsave(&io_tlb_lock, flags);
++ {
++ count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
++ io_tlb_list[index + nslots] : 0);
++ /*
++ * Step 1: return the slots to the free list, merging the
++ * slots with succeeding slots.
++ */
++ for (i = index + nslots - 1; i >= index; i--)
++ io_tlb_list[i] = ++count;
++ /*
++ * Step 2: merge the returned slots with the preceding slots,
++ * if available (non-zero).
++ */
++ for (i = index - 1;
++ (OFFSET(i, IO_TLB_SEGSIZE) !=
++ IO_TLB_SEGSIZE - 1) && io_tlb_list[i];
++ i--)
++ io_tlb_list[i] = ++count;
++ }
++ spin_unlock_irqrestore(&io_tlb_lock, flags);
++}
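++/*
++ * Merge example (illustrative): freeing slots 2..3 while slots 4..5 are
++ * already free (holding counts 2, 1) and slot 1 is also free leaves
++ * io_tlb_list[1..5] == 5, 4, 3, 2, 1 - one contiguous run spanning the
++ * old and the newly returned slots.
++ */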
++
++static void
++sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
++{
++ int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
++ struct phys_addr buffer = io_tlb_orig_addr[index];
++ BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
++ __sync_single(buffer, dma_addr, size, dir);
++}
++
++static void
++swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
++{
++ /*
++ * Ran out of IOMMU space for this operation. This is very bad.
++ * Unfortunately the drivers cannot handle this operation properly
++ * unless they check for pci_dma_mapping_error (most don't).
++ * When the mapping is small enough, return a static buffer to limit
++ * the damage, or panic when the transfer is too big.
++ */
++ printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
++ "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
++
++ if (size > io_tlb_overflow && do_panic) {
++ if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
++ panic("PCI-DMA: Memory would be corrupted\n");
++ if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
++ panic("PCI-DMA: Random memory would be DMAed\n");
++ }
++}
++
++/*
++ * Map a single buffer of the indicated size for DMA in streaming mode. The
++ * PCI address to use is returned.
++ *
++ * Once the device is given the dma address, the device owns this memory until
++ * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
++ */
++dma_addr_t
++swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
++{
++ dma_addr_t dev_addr = virt_to_bus(ptr);
++ void *map;
++ struct phys_addr buffer;
++
++ BUG_ON(dir == DMA_NONE);
++
++ /*
++ * If the pointer passed in happens to be in the device's DMA window,
++ * we can safely return the device addr and not worry about bounce
++ * buffering it.
++ */
++ if (!range_straddles_page_boundary(ptr, size) &&
++ !address_needs_mapping(hwdev, dev_addr))
++ return dev_addr;
++
++ /*
++ * Oh well, have to allocate and map a bounce buffer.
++ */
++ buffer.page = virt_to_page(ptr);
++ buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
++ map = map_single(hwdev, buffer, size, dir);
++ if (!map) {
++ swiotlb_full(hwdev, size, dir, 1);
++ map = io_tlb_overflow_buffer;
++ }
++
++ dev_addr = virt_to_bus(map);
++ return dev_addr;
++}
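++/*
++ * Usage sketch (illustrative; 'dev', 'buf' and 'len' are hypothetical):
++ *
++ *	dma_addr_t bus = swiotlb_map_single(dev, buf, len, DMA_TO_DEVICE);
++ *	... hand 'bus' to the device and wait for the transfer ...
++ *	swiotlb_unmap_single(dev, bus, len, DMA_TO_DEVICE);
++ */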
++
++/*
++ * Unmap a single streaming mode DMA translation. The dma_addr and size must
++ * match what was provided in a previous swiotlb_map_single call. All
++ * other usages are undefined.
++ *
++ * After this call, reads by the cpu to the buffer are guaranteed to see
++ * whatever the device wrote there.
++ */
++void
++swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
++ int dir)
++{
++ BUG_ON(dir == DMA_NONE);
++ if (in_swiotlb_aperture(dev_addr))
++ unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
++}
++
++/*
++ * Make physical memory consistent for a single streaming mode DMA translation
++ * after a transfer.
++ *
++ * If you perform a swiotlb_map_single() but wish to interrogate the buffer
++ * using the cpu, yet do not wish to tear down the PCI dma mapping, you must
++ * call this function before doing so. At the next point you give the PCI dma
++ * address back to the card, you must first perform a
++ * swiotlb_sync_single_for_device, and then the device again owns the buffer.
++ */
++void
++swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, int dir)
++{
++ BUG_ON(dir == DMA_NONE);
++ if (in_swiotlb_aperture(dev_addr))
++ sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
++}
++
++void
++swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, int dir)
++{
++ BUG_ON(dir == DMA_NONE);
++ if (in_swiotlb_aperture(dev_addr))
++ sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
++}
++
++/*
++ * Map a set of buffers described by scatterlist in streaming mode for DMA.
++ * This is the scatter-gather version of the above swiotlb_map_single
++ * interface. Here the scatter gather list elements are each tagged with the
++ * appropriate dma address and length. They are obtained via
++ * sg_dma_{address,length}(SG).
++ *
++ * NOTE: An implementation may be able to use a smaller number of
++ * DMA address/length pairs than there are SG table elements.
++ * (for example via virtual mapping capabilities)
++ * The routine returns the number of addr/length pairs actually
++ * used, at most nents.
++ *
++ * Device ownership issues as mentioned above for swiotlb_map_single are the
++ * same here.
++ */
++int
++swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
++ int dir)
++{
++ struct phys_addr buffer;
++ dma_addr_t dev_addr;
++ char *map;
++ int i;
++
++ BUG_ON(dir == DMA_NONE);
++
++ for (i = 0; i < nelems; i++, sg++) {
++ dev_addr = SG_ENT_PHYS_ADDRESS(sg);
++ if (address_needs_mapping(hwdev, dev_addr)) {
++ buffer.page = sg->page;
++ buffer.offset = sg->offset;
++ map = map_single(hwdev, buffer, sg->length, dir);
++ if (!map) {
++ /* Don't panic here, we expect map_sg users
++ to do proper error handling. */
++ swiotlb_full(hwdev, sg->length, dir, 0);
++ swiotlb_unmap_sg(hwdev, sg - i, i, dir);
++ sg[0].dma_length = 0;
++ return 0;
++ }
++ sg->dma_address = (dma_addr_t)virt_to_bus(map);
++ } else
++ sg->dma_address = dev_addr;
++ sg->dma_length = sg->length;
++ }
++ return nelems;
++}
++
++/*
++ * Unmap a set of streaming mode DMA translations. Again, cpu read rules
++ * concerning calls here are the same as for swiotlb_unmap_single() above.
++ */
++void
++swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
++ int dir)
++{
++ int i;
++
++ BUG_ON(dir == DMA_NONE);
++
++ for (i = 0; i < nelems; i++, sg++)
++ if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
++ unmap_single(hwdev,
++ (void *)bus_to_virt(sg->dma_address),
++ sg->dma_length, dir);
++}
++
++/*
++ * Make physical memory consistent for a set of streaming mode DMA translations
++ * after a transfer.
++ *
++ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
++ * and usage.
++ */
++void
++swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
++ int nelems, int dir)
++{
++ int i;
++
++ BUG_ON(dir == DMA_NONE);
++
++ for (i = 0; i < nelems; i++, sg++)
++ if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
++ sync_single(hwdev,
++ (void *)bus_to_virt(sg->dma_address),
++ sg->dma_length, dir);
++}
++
++void
++swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
++ int nelems, int dir)
++{
++ int i;
++
++ BUG_ON(dir == DMA_NONE);
++
++ for (i = 0; i < nelems; i++, sg++)
++ if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
++ sync_single(hwdev,
++ (void *)bus_to_virt(sg->dma_address),
++ sg->dma_length, dir);
++}
++
++#ifdef CONFIG_HIGHMEM
++
++dma_addr_t
++swiotlb_map_page(struct device *hwdev, struct page *page,
++ unsigned long offset, size_t size,
++ enum dma_data_direction direction)
++{
++ struct phys_addr buffer;
++ dma_addr_t dev_addr;
++ char *map;
++
++ dev_addr = page_to_bus(page) + offset;
++ if (address_needs_mapping(hwdev, dev_addr)) {
++ buffer.page = page;
++ buffer.offset = offset;
++ map = map_single(h